4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
#include <unordered_map>

#include <unistd.h>

#include <nbdkit-filter.h>

#include "windows-compat.h"
/* Path of the statistics output file.  It is assigned from the
 * "statsfile" parameter (via nbdkit_absolute_path) and config_complete
 * refuses to continue while it is still NULL. */
59 static char *filename
;
/* Wall-clock time captured with gettimeofday when the filter becomes
 * ready; the elapsed time reported later is measured from this point. */
62 static struct timeval start_t
;
/* Fraction of requests covered by the size/alignment breakdown.  The
 * "statsthreshold" parameter stores its percentage divided by 100 here;
 * a value of 0 turns the breakdown off. */
63 static double print_threshold
= 0.95;
72 /* Keeps track of the number of request sizes and alignments. Requests
73 * are split into buckets by the number of bits needed to represent
74 * their size (i.e., floor(log2(req_size))), and the number
75 * of trailing zero-bits in the offset.
77 * The outer map is indexed by size bits, the inner by alignment bits.
78 * The value is the count of such requests. */
79 std::unordered_map
<int,
80 std::unordered_map
<int, uint64_t>> count
;
82 /* Keeps track of the aggregated size of all requests in a given
83 * request size bucket. */
84 std::unordered_map
<int, uint64_t> size
;
87 /* This lock protects all the stats. */
88 static pthread_mutex_t lock
= PTHREAD_MUTEX_INITIALIZER
;
/* One statistics record per kind of request.  The string in each
 * initializer is presumably the record's display name (print_stat
 * prints st->name) -- the struct layout is not visible here. */
89 static nbdstat pread_st
= { "read" };
90 static nbdstat pwrite_st
= { "write" };
91 static nbdstat trim_st
= { "trim" };
92 static nbdstat zero_st
= { "zero" };
93 static nbdstat extents_st
= { "extents" };
94 static nbdstat cache_st
= { "cache" };
95 static nbdstat flush_st
= { "flush" };
/* Number of bytes in one GiB (2^30); humansize compares against it to
 * choose the unit it prints. */
100 #define GiB 1073741824
/* Return the number of trailing zero bits in OFFSET, i.e. the largest
 * N for which OFFSET is a multiple of 2^N.
 *
 * Zero is a multiple of every power of two, so no single alignment can
 * be determined for it; the special flag value -1 is returned instead.
 */
static int
get_alignment (uint64_t offset)
{
  // Can't determine an alignment for 0, so use a special flag value.
  if (offset == 0)
    return -1;

  /* Shift the value itself instead of building (1 << i) - 1 masks.
   * The shift count then never reaches the width of the type, which
   * the mask form did for an offset with 63 trailing zero bits (and,
   * where unsigned long is only 32 bits wide, for every offset aligned
   * to 2^32 or more).  OFFSET is non-zero here, so the loop terminates
   * after at most 63 iterations.
   */
  int bits = 0;
  while ((offset & 1) == 0) {
    offset >>= 1;
    bits++;
  }
  return bits;
}
135 humansize (uint64_t bytes
)
141 r
= asprintf (&ret
, "%" PRIu64
" bytes", bytes
);
142 else if (bytes
< MiB
)
143 r
= asprintf (&ret
, "%.2f KiB", bytes
/ (double)KiB
);
144 else if (bytes
< GiB
)
145 r
= asprintf (&ret
, "%.2f MiB", bytes
/ (double)MiB
);
147 r
= asprintf (&ret
, "%.2f GiB", bytes
/ (double)GiB
);
154 humanrate (uint64_t bytes
, uint64_t usecs
)
156 double secs
= usecs
/ 1000000.0;
157 return secs
!= 0.0 ? humansize (bytes
/ secs
) : NULL
;
/* Substitute the placeholder "(n/a)" for a missing (NULL) string so the
 * result can always be handed to a %s conversion. */
static inline const char *
maybe (char *s)
{
  if (s == NULL)
    return "(n/a)";
  return s;
}
167 print_align_hist(const std::unordered_map
<int, uint64_t>& align_map
)
169 /* Convert to ordered map (convenient, since we need to mutate counts),
170 * find requests for offset zero (any alignment), and calculate total. */
171 std::map
<int, uint64_t> align_hist
;
172 uint64_t any_align_count
= 0;
174 for (auto &el
: align_map
) {
176 auto requests
= el
.second
;
178 any_align_count
= requests
;
180 align_hist
[bits
] = requests
;
185 /* "Fix-up" alignment counts (requests with 8-bit alignment also have
186 * 7-bit alignment, 6-bit alignment, etc) */
187 for (auto &el
: align_hist
) {
189 auto requests
= el
.second
;
190 while (--bits
>= 0) {
191 auto it
= align_hist
.find(bits
);
192 if (it
!= align_hist
.end())
193 it
->second
+= requests
;
195 el
.second
+= any_align_count
;
198 /* The smallest alignment must have the largest number of requests, so we
199 * can iterate in map-order, skipping over bits for which the number of
200 * requests does not change */
201 auto it
= align_hist
.begin();
202 auto cutoff
= static_cast<uint64_t> ((1-print_threshold
) * total
);
203 while(it
!= align_hist
.end()) {
204 auto bits
= it
->first
;
205 auto requests
= it
->second
;
207 if (requests
< cutoff
) {
208 fprintf (fp
, " %2d+ bit-aligned: %4.1f%% (%" PRIu64
")\n",
209 bits
, static_cast<double> (requests
) / total
* 100, requests
);
213 // Only print if number of requests differs from the next alignment
215 if (it
== align_hist
.end() || it
->second
!= requests
) {
216 fprintf (fp
, " %2d bit aligned: %5.1f%% (%" PRIu64
")\n",
217 bits
, static_cast<double>(requests
*100) / total
, requests
);
223 print_histogram (const nbdstat
*st
)
225 // Aggregate over alignment and invert map (so counts are keys and
226 // request size bits are values)
228 std::map
<uint64_t, int> req_count_size_m
;
229 for (auto &el1
: st
->count
) {
230 auto &align_map
= el1
.second
;
231 uint64_t requests
= 0;
232 for (auto &el2
: align_map
) {
233 requests
+= el2
.second
;
235 req_count_size_m
[requests
] = el1
.first
;
236 total
+= static_cast<double> (requests
);
238 if (st
->ops
!= static_cast<uint64_t> (total
)) {
239 fprintf(stderr
, "INTERNAL ERROR: per-bucket count (%f) does "
240 "not match total (%" PRIu64
")!\n", total
, st
->ops
);
244 /* Print block sizes until we have covered the *print_threshold* percentile */
245 auto to_print
= static_cast<uint64_t> (print_threshold
* total
);
246 uint64_t printed_reqs
= 0, printed_sizes
= 0;
247 for (auto it
= req_count_size_m
.rbegin(); it
!= req_count_size_m
.rend(); it
++) {
248 if (printed_reqs
>= to_print
) {
249 auto requests
= st
->ops
- printed_reqs
;
250 char *total_size
= humansize(st
->bytes
- printed_sizes
);
251 fprintf (fp
, " other sizes: %4.1f%% (%" PRIu64
" reqs, %s total)\n",
252 static_cast<double> (requests
) / total
* 100,
253 requests
, total_size
);
258 auto bits
= it
->second
;
259 auto requests
= it
->first
;
260 char *total_size
= humansize(st
->size
.at(bits
));
261 fprintf (fp
, " %2d bits: %4.1f%% (%" PRIu64
" reqs, %s total)\n", bits
,
262 static_cast<double> (requests
) / total
* 100, requests
,
265 printed_reqs
+= requests
;
266 total_size
+= st
->size
.at(bits
);
268 print_align_hist (st
->count
.at(bits
));
273 print_stat (const nbdstat
*st
, int64_t usecs
)
276 char *size
= humansize (st
->bytes
);
277 char *op_rate
= humanrate (st
->bytes
, st
->usecs
);
278 char *total_rate
= humanrate (st
->bytes
, usecs
);
280 fprintf (fp
, "%s: %" PRIu64
" ops, %.6f s, %s, %s/s op, %s/s total\n",
281 st
->name
, st
->ops
, st
->usecs
/ 1000000.0, maybe (size
),
282 maybe (op_rate
), maybe (total_rate
));
288 if (print_threshold
!= 0 and st
->count
.size() != 0) {
289 fprintf (fp
, " Request size and alignment breakdown:\n"),
290 print_histogram (st
);
297 print_totals (uint64_t usecs
)
299 uint64_t ops
= pread_st
.ops
+ pwrite_st
.ops
+ trim_st
.ops
+ zero_st
.ops
+
300 extents_st
.ops
+ flush_st
.ops
;
301 uint64_t bytes
= pread_st
.bytes
+ pwrite_st
.bytes
+ trim_st
.bytes
+
303 char *size
= humansize (bytes
);
304 char *rate
= humanrate (bytes
, usecs
);
306 fprintf (fp
, "total: %" PRIu64
" ops, %.6f s, %s, %s/s\n",
307 ops
, usecs
/ 1000000.0, maybe (size
), maybe (rate
));
314 print_stats (int64_t usecs
)
316 print_totals (usecs
);
317 print_stat (&pread_st
, usecs
);
318 print_stat (&pwrite_st
, usecs
);
319 print_stat (&trim_st
, usecs
);
320 print_stat (&zero_st
, usecs
);
321 print_stat (&extents_st
, usecs
);
322 print_stat (&cache_st
, usecs
);
323 print_stat (&flush_st
, usecs
);
/* Work out how much time has passed since start_t was recorded. */
333 gettimeofday (&now
, NULL
);
334 usecs
= tvdiff_usec (&start_t
, &now
);
/* Proceed only if the output stream was opened and a positive amount
 * of time has elapsed; the stats lock is taken for the rest of the
 * enclosing scope. */
335 if (fp
&& usecs
> 0) {
336 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock
);
346 stats_config (nbdkit_next_config
*next
, nbdkit_backend
*nxdata
,
347 const char *key
, const char *value
)
351 if (strcmp (key
, "statsfile") == 0) {
353 filename
= nbdkit_absolute_path (value
);
354 if (filename
== NULL
)
358 else if (strcmp (key
, "statsappend") == 0) {
359 r
= nbdkit_parse_bool (value
);
365 else if (strcmp (key
, "statsthreshold") == 0) {
367 r
= nbdkit_parse_int ("printing threshold", value
, &ival
);
370 if (ival
> 100 or ival
< 0) {
371 nbdkit_error ("statsthreshold must be between 0 and 100 (percent)");
374 print_threshold
= static_cast<double>(ival
) / 100;
378 return next (nxdata
, key
, value
);
382 stats_config_complete (nbdkit_next_config_complete
*next
,
383 nbdkit_backend
*nxdata
)
385 if (filename
== NULL
) {
386 nbdkit_error ("stats filter requires statsfile parameter");
390 return next (nxdata
);
394 stats_get_ready (int thread_model
)
398 /* Using fopen("ae"/"we") would be more convenient, but as Haiku
399 * still lacks that, use this instead. Atomicity is not essential
400 * here since .config completes before threads that might fork, if
401 * we have to later add yet another fallback to fcntl(fileno()) for
402 * systems without O_CLOEXEC.
405 O_CLOEXEC
| O_WRONLY
| O_CREAT
| (append
? O_APPEND
: O_TRUNC
),
408 nbdkit_error ("open: %s: %m", filename
);
411 fp
= fdopen (fd
, append
? "a" : "w");
413 nbdkit_error ("fdopen: %s: %m", filename
);
417 gettimeofday (&start_t
, NULL
);
/* Help text for the filter's parameters: one line for each key that
 * stats_config accepts. */
#define stats_config_help \
  "statsfile=<FILE> (required) The file to place the log in.\n" \
  "statsappend=<BOOL> True to append to the log (default false).\n" \
  "statsthreshold=<PERCENT> Percentage (0-100) of requests detailed in the\n" \
  " size and alignment breakdown (default 95, 0 disables it).\n"
427 record_stat (nbdstat
*st
, uint32_t size
, uint64_t offset
,
428 const struct timeval
*start
)
433 gettimeofday (&end
, NULL
);
434 usecs
= tvdiff_usec (start
, &end
);
436 // fast path if not collecting histogram data
437 static bool out_of_memory
= false;
438 if (out_of_memory
|| print_threshold
== 0 || size
== 0) {
439 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock
);
446 // Calculate bits needed to represent request size
453 // Calculate trailing zero bits
454 int align_bits
= get_alignment (offset
);
456 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock
);
462 st
->count
[size_bits
][align_bits
]++;
463 st
->size
[size_bits
] += size
;
465 catch (std::bad_alloc
const&) {
466 // Avoid reporting the same error over and over again
467 nbdkit_error ("out of memory for request size statistics");
468 out_of_memory
= true;
474 stats_pread (nbdkit_next
*next
,
475 void *handle
, void *buf
, uint32_t count
, uint64_t offset
,
476 uint32_t flags
, int *err
)
478 struct timeval start
;
481 gettimeofday (&start
, NULL
);
482 r
= next
->pread (next
, buf
, count
, offset
, flags
, err
);
483 if (r
== 0) record_stat (&pread_st
, count
, offset
, &start
);
489 stats_pwrite (nbdkit_next
*next
,
491 const void *buf
, uint32_t count
, uint64_t offset
,
492 uint32_t flags
, int *err
)
494 struct timeval start
;
497 gettimeofday (&start
, NULL
);
498 r
= next
->pwrite (next
, buf
, count
, offset
, flags
, err
);
499 if (r
== 0) record_stat (&pwrite_st
, count
, offset
, &start
);
505 stats_trim (nbdkit_next
*next
,
507 uint32_t count
, uint64_t offset
, uint32_t flags
,
510 struct timeval start
;
513 gettimeofday (&start
, NULL
);
514 r
= next
->trim (next
, count
, offset
, flags
, err
);
515 if (r
== 0) record_stat (&trim_st
, count
, offset
, &start
);
521 stats_flush (nbdkit_next
*next
,
522 void *handle
, uint32_t flags
,
525 struct timeval start
;
528 gettimeofday (&start
, NULL
);
529 r
= next
->flush (next
, flags
, err
);
530 if (r
== 0) record_stat (&flush_st
, 0, 0, &start
);
536 stats_zero (nbdkit_next
*next
,
538 uint32_t count
, uint64_t offset
, uint32_t flags
,
541 struct timeval start
;
544 gettimeofday (&start
, NULL
);
545 r
= next
->zero (next
, count
, offset
, flags
, err
);
546 if (r
== 0) record_stat (&zero_st
, count
, offset
, &start
);
552 stats_extents (nbdkit_next
*next
,
554 uint32_t count
, uint64_t offset
, uint32_t flags
,
555 struct nbdkit_extents
*extents
, int *err
)
557 struct timeval start
;
560 gettimeofday (&start
, NULL
);
561 r
= next
->extents (next
, count
, offset
, flags
, extents
, err
);
562 /* XXX There's a case for trying to determine how long the extents
563 * will be that are returned to the client (instead of simply using
564 * count), given the flags and the complex rules in the protocol.
566 if (r
== 0) record_stat (&extents_st
, count
, offset
, &start
);
572 stats_cache (nbdkit_next
*next
,
574 uint32_t count
, uint64_t offset
, uint32_t flags
,
577 struct timeval start
;
580 gettimeofday (&start
, NULL
);
581 r
= next
->cache (next
, count
, offset
, flags
, err
);
582 if (r
== 0) record_stat (&cache_st
, count
, offset
, &start
);
586 static struct nbdkit_filter filter
= []() -> nbdkit_filter
{
587 auto f
= nbdkit_filter();
589 f
.longname
= "nbdkit stats filter";
590 f
.unload
= stats_unload
;
591 f
.config
= stats_config
;
592 f
.config_complete
= stats_config_complete
;
593 f
.config_help
= stats_config_help
;
594 f
.get_ready
= stats_get_ready
;
595 f
.pread
= stats_pread
;
596 f
.pwrite
= stats_pwrite
;
597 f
.flush
= stats_flush
;
600 f
.extents
= stats_extents
;
601 f
.cache
= stats_cache
;
606 NBDKIT_REGISTER_FILTER(filter
)