Checking in changes prior to tagging of version 2.30.
[MogileFS-Utils.git] / mogstats
blob1e2bfec921eb43e3aea1076c74e29fd64671630b
1 #!/usr/bin/perl
2 # vim:ts=4 sw=4 ft=perl et:
3 # TODO:
4 # YAML display mode
5 # timing output during fetch
6 # support different db backends better (something mildly generic).
8 =head1 NAME
10 mogstats -- Utility for calculating slow stats directly against a MogileFS DB
12 =head1 SYNOPSIS
14 $ mogstats --db_dsn="DBI:mysql:mfs:host=mfshost" --db_user="mfs" \
15 --db_pass="mfs" --verbose --stats="devices,files"
16 $ mogstats --stats="all"
17 $ mogstats [all options in ~/.mogilefs.conf]
19 =head1 DESCRIPTION
21 Utility for inspecting queues and running general statistics against a
22 MogileFS database. Some of these stats can take a very long time to run
23 against a large instance, so the utility can be pointed at a read slave or
24 special account.
26 =head1 OPTIONS
28 =over
30 =item --db_dsn=<DBI_dsn>
32 The DSN to use for connecting to the MogileFS database server.
34 =item --db_user=<username>
36 A database user for connecting to the database.
38 =item --db_pass=<password>
40 An optional password for the database user.
42 =item --config=<file>
44 An explicit config file to use. By default /etc/mogilefs/mogilefs.conf and
45 ~/.mogilefs.conf are checked.
47 =item --verbose
49 Print some extra text during processing. Mostly notes about what stats are
50 starting or finishing.
52 =item --stats=<stats>
54 A list of which statistics to calculate. Notes on some of them are listed
55 below, see --help for full list. A value of "all" fetches all possible stats.
57 =item --help
59 List usage info and supported statistics.
61 =back
63 =head1 AVAILABLE STATISTICS
65 Contains notes on which stats may be fast or slow.
67 =over
69 =item devices
71 Lists count of files and current database status per-device. Can be very slow.
73 =item fids
75 Lists the current highest file id. Should be fast.
77 =item files
79 Gives a breakdown of where files are by domain and class. Displays the size of
80 all stored files pre-replication, as well as post-replication size. The latter
81 being closer to the actual storage amount. Can be very slow.
83 =item domains
85 Shows a simple count of where files are by domain and class. Faster than using
86 "files" but displays less information.
88 =item replication
90 Displays a breakdown of devcount per domain/class combo. Shows number of files
91 in domain "foo" with class "bar" that have a current devcount of 0, 1, 2, 3,
92 etc. Useful for spotting broken files (devcount 0), replication lag, or over
93 replication bugs. Can be very slow.
95 =item replication-queue
97 Quick breakdown of how many fids are due for replication. Fids listed as
98 "manual" need manual intervention before they can be replicated, and could be
99 broken. Will be fast unless there are many files in queue.
101 =item delete-queue
103 Similar to replication-queue.
105 =item general-queue
107 Displays a breakdown of what's in the general queue. This includes FSCK,
108 Rebalance, and other temporary queueing systems MogileFS has. Should be fast
109 unless you have configured MogileFS to queue many fids at once.
111 =back
113 =head1 AUTHOR
115 Dormando E<lt>L<dormando@rydia.net>E<gt>
117 =head1 BUGS
119 None known.
121 =head1 LICENSE
123 Licensed for use and redistribution under the same terms as Perl itself.
125 =cut
127 use strict;
128 use warnings;
129 use DBI;
130 use Getopt::Long;
132 # FIXME: decide how to share constants between utils and server.
133 use constant ENDOFTIME => 2147483647;
134 my %QUEUES = ( 1 => 'FSCK_QUEUE', 2 => 'REBAL_QUEUE' );
135 my %valid_stats = map { $_ => 1 } qw/devices fids files domains replication replication-queue delete-queue general-queues all/;
137 my $DBH_CACHE = '';
138 my $DB_TYPE = '';
140 my %opts;
142 # FIXME: Use MogileFS::Utils for configuration junk.
143 Getopt::Long::Configure("require_order", "pass_through");
144 GetOptions(
145 "config=s" => \$opts{config},
146 "lib=s" => \$opts{lib},
147 "help" => \$opts{help},
148 "verbose" => \$opts{verbose},
149 "stats=s" => \$opts{stats},
150 "db_dsn=s" => \$opts{db_dsn},
151 "db_user=s" => \$opts{db_user},
152 "db_pass=s" => \$opts{db_pass},
153 ) or abort_with_usage();
154 Getopt::Long::Configure("require_order", "no_pass_through");
156 my @configs = ($opts{config}, "$ENV{HOME}/.mogilefs.conf", "/etc/mogilefs/mogilefs.conf");
157 foreach my $fn (reverse @configs) {
158 next unless $fn && -e $fn;
159 open FILE, "<$fn"
160 or die "unable to open $fn: $!\n";
161 while (<FILE>) {
162 s/\s+#.*//;
163 next unless m/^\s*(\w+)\s*=\s*(.+?)\s*$/;
164 $opts{$1} = $2 unless ( defined $opts{$1} );
166 close FILE;
169 abort_with_usage() if $opts{help};
171 cmd_stats($opts{stats});
173 sub abort_with_usage {
174 my $message = shift;
175 print "ERROR: $message\n\n" if $message;
177 print qq{Usage:
178 mogstats --db_dsn="DBI:mysql:mfs:host=mfshost" --db_user="mfs" \
179 --db_pass="mfs" --verbose --stats="devices,files"
180 mogstats --stats="all"
181 mogstats [all options in ~/.mogilefs.conf]
184 print "valid stats: ", join(', ', sort keys %valid_stats), "\n";
185 exit 1;
188 sub cmd_stats {
189 my $args = shift;
190 $args = 'all' unless $args;
191 my %args = map { $_ => 1 } split(/,/, $args);
192 for my $arg (keys %args) {
193 abort_with_usage("Invalid stat $arg") unless $valid_stats{$arg};
196 print "Fetching statistics... ($args)\n";
197 my $stats = stats_from_db(\%args)
198 or die "Can't fetch stats";
200 if ($args{devices} || $args{all}) {
201 print "\nStatistics for devices...\n";
202 printf " %-10s %-15s %12s %10s\n", "device", "host", "files", "status";
203 printf " ---------- ---------------- ------------ ----------\n";
204 foreach my $device (sort { $a <=> $b } keys %{$stats->{devices}}) {
205 my $value = $stats->{devices}->{$device};
206 printf " %-10s %-10s %10s %10s\n", "dev$device", $value->{host}, $value->{files}, $value->{status};
208 printf " ---------- ---------------- ------------ ----------\n";
211 if ($args{fids} || $args{all}) {
212 print "\nStatistics for file ids...\n";
213 printf " Max file id: %s\n", $stats->{fids}->{max} || 'none';
216 if ($args{files} || $args{all}) {
217 print "\nStatistics for files...\n";
218 printf " %-20s %-10s %10s %11s %13s\n", 'domain', 'class', 'files',
219 'size (m)', 'fullsize (m)';
220 printf " -------------------- ----------- ---------- ----------- -------------\n";
221 foreach my $domain (sort keys %{$stats->{files}}) {
222 my $classes = $stats->{files}->{$domain};
223 foreach my $class (sort keys %$classes) {
224 my $stat = $classes->{$class};
225 my $files = $stat->[2];
226 my $size = int($stat->[3] / 1024 / 1024);
227 my $total = int($stat->[4] / 1024 / 1024);
228 printf " %-20s %-10s %10s %11s %13s\n", $domain, $class,
229 $files, $size, $total;
232 printf " -------------------- ----------- ---------- ----------- -------------\n";
235 if ($args{domains} && !($args{files} || $args{all})) {
236 print "\nStatistics for domains...\n";
237 printf " %-20s %-10s %10s\n", 'domain', 'class', 'files';
238 printf " -------------------- ----------- ----------\n";
239 foreach my $domain (sort keys %{$stats->{domains}}) {
240 my $classes = $stats->{domains}->{$domain};
241 foreach my $class (sort keys %$classes) {
242 my $files = $classes->{$class};
243 printf " %-20s %-10s %10s\n", $domain, $class, $files;
246 printf " -------------------- ----------- ----------\n";
249 if ($args{replication} || $args{all}) {
250 print "\nStatistics for replication...\n";
251 printf " %-20s %-10s %10s %10s\n", 'domain', 'class', 'devcount', 'files';
252 printf " -------------------- ----------- ---------- ----------\n";
253 foreach my $domain (sort keys %{$stats->{replication}}) {
254 my $classes = $stats->{replication}->{$domain};
255 foreach my $class (sort keys %$classes) {
256 my $devcounts = $classes->{$class};
257 foreach my $devcount (sort { $a <=> $b } keys %$devcounts) {
258 my $files = $devcounts->{$devcount};
259 printf " %-20s %-10s %10s %10s\n", $domain, $class, $devcount, $files;
263 printf " -------------------- ----------- ---------- ----------\n";
265 # Now new stats.
266 if ($args{'replication-queue'} || $args{all}) {
267 print "\nStatistics for replication queue...\n";
268 printf " %-20s %12s\n", 'status', 'count';
269 printf " -------------------- ------------\n";
270 for my $status (sort keys %{$stats->{to_replicate}}) {
271 my $files = $stats->{to_replicate}->{$status};
272 printf " %-20s %12s\n", $status, $files;
274 printf " -------------------- ------------\n";
278 if ($args{'delete-queue'} || $args{all}) {
279 print "\nStatistics for delete queue...\n";
280 printf " %-20s %12s\n", 'status', 'count';
281 printf " -------------------- ------------\n";
282 for my $status (sort keys %{$stats->{to_delete}}) {
283 my $files = $stats->{to_delete}->{$status};
284 printf " %-20s %12s\n", $status, $files;
286 printf " -------------------- ------------\n";
290 if ($args{'general-queues'} || $args{all}) {
291 print "\nStatistics for general queues...\n";
292 printf " %-15s %-20s %12s\n", 'queue', 'status', 'count';
293 printf " --------------- -------------------- ------------\n";
294 for my $queue (sort keys %{$stats->{queue}}) {
295 my $status = $stats->{queue}->{$queue};
296 for my $stat (sort keys %{$status}) {
297 my $files = $status->{$stat};
298 printf " %-15s %-20s %12s\n", $queue, $stat, $files;
301 printf " --------------- -------------------- ------------\n";
304 print "\ndone\n";
307 sub get_dbh {
308 return $DBH_CACHE if ($DBH_CACHE && $DBH_CACHE->ping);
309 $DBH_CACHE = DBI->connect($opts{db_dsn}, $opts{db_user}, $opts{db_pass}, {
310 PrintError => 0,
311 AutoCommit => 1,
312 RaiseError => 1,
313 }) or die "Failed to connect to database: " . DBI->errstr;
314 my $dsn = $opts{db_dsn};
315 if ($dsn =~ /^DBI:mysql:/i) {
316 $DB_TYPE = "MySQL";
317 } elsif ($dsn =~ /^DBI:SQLite:/i) {
318 $DB_TYPE = "SQLite";
319 } elsif ($dsn =~ /^DBI:Pg:/i) {
320 $DB_TYPE = "Postgres";
321 } else {
322 die "Unknown database type: $dsn";
324 return $DBH_CACHE;
327 sub stats_from_db {
328 my $args = shift;
330 # get database handle
331 my $ret = {};
332 my $dbh = get_dbh() or die "Could not get database handle";
334 # get names of all domains and classes for use later
335 my %classes;
336 my $rows;
338 $rows = $dbh->selectall_arrayref('SELECT d.dmid, d.namespace, c.classid, c.classname ' .
339 'FROM domain d LEFT JOIN class c ON c.dmid=d.dmid');
341 foreach my $row (@$rows) {
342 $classes{$row->[0]}->{name} = $row->[1];
343 $classes{$row->[0]}->{classes}->{$row->[2] || 0} = $row->[3] || 'default';
345 $classes{$_}->{classes}->{0} = 'default'
346 foreach keys %classes;
348 # get host and device information with device status
349 my %devices;
350 $rows = $dbh->selectall_arrayref('SELECT device.devid, hostname, device.status ' .
351 'FROM device, host WHERE device.hostid = host.hostid');
352 foreach my $row (@$rows) {
353 $devices{$row->[0]}->{host} = $row->[1];
354 $devices{$row->[0]}->{status} = $row->[2];
356 my %globals = ( classes => \%classes, devices => \%devices );
358 # if they want replication counts, or didn't specify what they wanted
359 if ($args->{replication} || $args->{all}) {
360 $ret->{replication} = stats_for_replication(\%globals);
363 # Stats about the replication queue (deferred, overdue)
364 if ($args->{'replication-queue'} || $args->{all}) {
365 $ret->{to_replicate} = stats_for_to_replicate(\%globals);
368 # Stats about the delete queue (deferred, overdue)
369 if ($args->{'delete-queue'} || $args->{all}) {
370 $ret->{to_delete} = stats_for_to_delete(\%globals);
373 # file statistics (how many files there are and in what domains/classes)
374 if ($args->{files} || $args->{all}) {
375 $ret->{files} = stats_for_files(\%globals);
378 # domain statistics (how many files per domain, faster than file stats)
379 if ($args->{domains} && !($args->{files} || $args->{all})) {
380 $ret->{domains} = stats_for_domains(\%globals);
383 # device statistics (how many files are on each device)
384 if ($args->{devices} || $args->{all}) {
385 $ret->{devices} = stats_for_devices(\%globals);
388 # now fid statistics
389 if ($args->{fids} || $args->{all}) {
390 verbose("... fid stats...");
391 my $max = $dbh->selectrow_array('SELECT MAX(fid) FROM file');
392 $ret->{fids} = { max => $max };
393 verbose("... done");
396 if ($args->{'general-queues'} || $args->{all}) {
397 $ret->{queue} = stats_for_to_queue(\%globals);
400 return $ret;
403 sub stats_for_devices {
404 my $globals = shift;
405 my %classes = %{$globals->{classes}};
406 my %devices = %{$globals->{devices}};
407 my $dbh = get_dbh() or die "Could not get database handle";
409 verbose("... per-device stats...");
410 my $stats = $dbh->selectall_arrayref('SELECT devid, COUNT(devid) FROM file_on GROUP BY 1');
411 my $devs = {};
412 for my $stat (@$stats) {
413 my $host = $devices{$stat->[0]}->{host};
414 my $status = $devices{$stat->[0]}->{status};
415 $devs->{$stat->[0]} = {
416 host => $host,
417 status => $status,
418 files => $stat->[1],
421 verbose("... done");
422 return $devs;
425 sub stats_for_files_sql {
426 my $sql = 'SELECT dmid, classid, COUNT(classid), sum(length), sum(length::int8 * devcount::int8) FROM file GROUP BY 1, 2';
427 return $sql if ($DB_TYPE eq 'Postgres');
428 $sql =~ s/::int8//g;
429 return $sql;
432 sub stats_for_files {
433 my $globals = shift;
434 my %classes = %{$globals->{classes}};
435 my %devices = %{$globals->{devices}};
436 my $dbh = get_dbh() or die "Could not get database handle";
438 verbose("... files stats...");
439 my $stats = $dbh->selectall_arrayref(stats_for_files_sql());
440 my $files = {};
441 for my $stat (@$stats) {
442 my $domain = $classes{$stat->[0]}->{name};
443 my $class = $classes{$stat->[0]}->{classes}->{$stat->[1]};
444 $files->{$domain}->{$class} = $stat;
446 verbose("... done");
447 return $files;
450 sub stats_for_domains {
451 my $globals = shift;
452 my %classes = %{$globals->{classes}};
453 my %devices = %{$globals->{devices}};
454 my $dbh = get_dbh() or die "Could not get database handle";
456 verbose("... domains stats...");
457 my $stats = $dbh->selectall_arrayref('SELECT dmid, classid, COUNT(classid) FROM file GROUP BY 1, 2');
458 my $files = {};
459 for my $stat (@$stats) {
460 my $domain = $classes{$stat->[0]}->{name};
461 my $class = $classes{$stat->[0]}->{classes}->{$stat->[1]};
462 $files->{$domain}->{$class} = $stat->[2];
464 verbose("... done");
465 return $files;
468 sub stats_for_replication {
469 my $globals = shift;
470 my %classes = %{$globals->{classes}};
471 my %devices = %{$globals->{devices}};
472 my $dbh = get_dbh() or die "Could not get database handle";
474 verbose("... replication stats...");
475 # replication stats
476 # This is the old version that used devcount:
477 my @stats = get_stats_files_per_devcount();
479 my $repl = {};
480 for my $stat (@stats) {
481 my $domain = $classes{$stat->{dmid}}->{name};
482 my $class =
483 $classes{$stat->{dmid}}->{classes}->{$stat->{classid}};
484 $repl->{$domain}->{$class}->{$stat->{devcount}} = $stat->{count};
487 verbose("... done");
488 return $repl;
491 sub stats_for_to_queue {
492 my $ret = {};
493 my $dbh = get_dbh() or die "Could not get database handle";
495 verbose("... queue stats...");
496 my $db_time = $dbh->selectrow_array('SELECT '. unix_timestamp());
497 my $stats = $dbh->selectall_arrayref('SELECT type, nexttry, COUNT(*) FROM file_to_queue GROUP BY 1, 2');
498 for my $stat (@$stats) {
499 my $qname = $QUEUES{$stat->[0]} || "UNKNOWN_QUEUE";
500 if ($stat->[1] < 1000) {
501 my $name = { 0 => 'new', 1 => 'redo' }->{$stat->[1]} ||
502 "unknown";
503 $ret->{$qname}->{"$name"} += $stat->[2];
504 } elsif ($stat->[1] == ENDOFTIME) {
505 $ret->{$qname}->{"manual"} = $stat->[2];
506 } elsif ($stat->[0] < $db_time) {
507 $ret->{$qname}->{"overdue"} += $stat->[2];
508 } else {
509 $ret->{$qname}->{"deferred"} += $stat->[2];
512 verbose("... done");
513 return $ret;
516 # TODO: See how much of this code is collapsable...
517 sub stats_for_to_delete {
518 my $ret = {};
519 my $dbh = get_dbh() or die "Could not get database handle";
521 verbose("... delete queue stats...");
522 my $db_time = $dbh->selectrow_array('SELECT '. unix_timestamp());
523 my $stats = $dbh->selectall_arrayref('SELECT nexttry, COUNT(*) FROM file_to_delete2 GROUP BY 1');
524 for my $stat (@$stats) {
525 if ($stat->[0] < 1000) {
526 my $name = { 0 => 'new', 1 => 'redo' }->{$stat->[0]} ||
527 "unknown";
528 $ret->{$name} += $stat->[1];
529 } elsif ($stat->[0] == ENDOFTIME) {
530 $ret->{manual} = $stat->[1];
531 } elsif ($stat->[0] < $db_time) {
532 $ret->{overdue} += $stat->[1];
533 } else {
534 $ret->{deferred} += $stat->[1];
537 verbose("... done");
538 return $ret;
541 sub stats_for_to_replicate {
542 my $ret = {};
543 my $dbh = get_dbh() or die "Could not get database handle";
545 # now we want to do the "new" replication stats
546 verbose("... replication queue stats...");
547 my $db_time = $dbh->selectrow_array('SELECT '. unix_timestamp());
548 my $stats = $dbh->selectall_arrayref('SELECT nexttry, COUNT(*) FROM file_to_replicate GROUP BY 1');
549 foreach my $stat (@$stats) {
550 if ($stat->[0] < 1000) {
551 # anything under 1000 is a specific state, so let's define those. here's the list
552 # of short names to describe them.
553 my $name = {
554 0 => 'newfile', # new files that need to be replicated
555 1 => 'redo', # files that need to go through replication again
556 }->{$stat->[0]} || "unknown";
558 # now put it in the output hashref. note that we do += because we might
559 # have more than one group of unknowns.
560 $ret->{"$name"} += $stat->[1];
562 } elsif ($stat->[0] == ENDOFTIME) {
563 $ret->{"manual"} = $stat->[1];
565 } elsif ($stat->[0] < $db_time) {
566 $ret->{"overdue"} += $stat->[1];
568 } else {
569 $ret->{"deferred"} += $stat->[1];
572 verbose("... done");
573 return $ret;
576 # FIXME: This is obviously MySQL-only.
577 sub unix_timestamp {
578 if ($DB_TYPE eq 'MySQL') {
579 return "UNIX_TIMESTAMP()";
580 } elsif ($DB_TYPE eq 'Postgres') {
581 return "EXTRACT(epoch FROM NOW())::int4";
582 } elsif ($DB_TYPE eq 'SQLite') {
583 return "strftime('%s','now')";
587 sub get_stats_files_per_devcount {
588 my $dbh = get_dbh();
589 my @ret;
590 my $sth = $dbh->prepare('SELECT dmid, classid, devcount, COUNT(devcount) AS "count" FROM file GROUP BY 1, 2, 3');
591 $sth->execute;
592 while (my $row = $sth->fetchrow_hashref) {
593 push @ret, $row;
595 return @ret;
598 sub verbose {
599 print $_[0], "\n" if $opts{verbose};