git-remote-mediawiki: get rid of O(N^2) loop
[git/gitweb.git] / contrib / stats / packinfo.pl
blobbe188c0f11dbea8320737b4fdf426a2f5acd1a00
1 #!/usr/bin/perl
3 # This tool will print vaguely pretty information about a pack. It
4 # expects the output of "git verify-pack -v" as input on stdin.
6 # $ git verify-pack -v | packinfo.pl
8 # This prints some full-pack statistics; currently "all sizes", "all
9 # path sizes", "tree sizes", "tree path sizes", and "depths".
11 # * "all sizes" stats are across every object size in the file;
12 # full sizes for base objects, and delta size for deltas.
13 # * "all path sizes" stats are across all objects' "path sizes".
14 # A path size is the sum of the size of the delta chain, including the
15 # base object. In other words, it's how many bytes need be read to
16 # reassemble the file from deltas.
17 # * "tree sizes" are object sizes grouped into delta trees.
18 # * "tree path sizes" are path sizes grouped into delta trees.
19 # * "depths" should be obvious.
21 # When run as:
23 # $ git verify-pack -v | packinfo.pl -tree
25 # the trees of objects are output along with the stats. This looks
26 # like:
28 # 0 commit 031321c6... 803 803
30 # 0 blob 03156f21... 1767 1767
31 # 1 blob f52a9d7f... 10 1777
32 # 2 blob a8cc5739... 51 1828
33 # 3 blob 660e90b1... 15 1843
34 # 4 blob 0cb8e3bb... 33 1876
35 # 2 blob e48607f0... 311 2088
36 # size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85
37 # path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26
39 # The first number after the sha1 is the object size, the second
40 # number is the path size. The statistics are across all objects in
41 # the previous delta tree. Obviously they are omitted for trees of
42 # one object.
44 # When run as:
46 # $ git verify-pack -v | packinfo.pl -tree -filenames
48 # it adds filenames to the tree. Getting this information is slow:
50 # 0 blob 03156f21... 1767 1767 Documentation/git-lost-found.txt @ tags/v1.2.0~142
51 # 1 blob f52a9d7f... 10 1777 Documentation/git-lost-found.txt @ tags/v1.5.0-rc1~74
52 # 2 blob a8cc5739... 51 1828 Documentation/git-lost+found.txt @ tags/v0.99.9h^0
53 # 3 blob 660e90b1... 15 1843 Documentation/git-lost+found.txt @ master~3222^2~2
54 # 4 blob 0cb8e3bb... 33 1876 Documentation/git-lost+found.txt @ master~3222^2~3
55 # 2 blob e48607f0... 311 2088 Documentation/git-lost-found.txt @ tags/v1.5.2-rc3~4
56 # size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85
57 # path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26
59 # When run as:
61 # $ git verify-pack -v | packinfo.pl -dump
63 # it prints out "sha1 size pathsize depth" for each sha1 in lexical
64 # order.
66 # 000079a2eaef17b7eae70e1f0f635557ea67b644 30 472 7
67 # 00013cafe6980411aa6fdd940784917b5ff50f0a 44 1542 4
68 # 000182eacf99cde27d5916aa415921924b82972c 499 499 0
69 # ...
71 # This is handy for comparing two packs. Adding "-filenames" will add
72 # filenames, as per "-tree -filenames" above.
74 use strict;
75 use Getopt::Long;
# Command-line switches (the header comment shows sample output for each):
#   -tree       print every delta tree followed by its statistics
#   -filenames  annotate objects with "path @ revision"; slow, and only
#               consulted together with -tree or -dump
#   -dump       print one line per object instead of the statistics
my $filenames = 0;
my $tree = 0;
my $dump = 0;
# GetOptions() returns false after complaining about an unknown or
# malformed switch; stop there instead of running in an unintended mode.
GetOptions("tree" => \$tree,
           "filenames" => \$filenames,
           "dump" => \$dump)
    or die "usage: git verify-pack -v <pack> | $0 [-tree] [-filenames] [-dump]\n";
# Tables filled from the "git verify-pack -v" listing read on stdin.
# Hashes are keyed by object name unless noted otherwise.
my %parents;    # object -> delta base (undef for non-delta objects)
my %children;   # delta base -> [objects stored as deltas against it]
my %sizes;      # object -> size column of the listing
my @roots;      # objects without a delta base, i.e. roots of delta trees
my %paths;      # object -> "path @ revision" (only filled with -filenames)
my %types;      # object -> type column of the listing (e.g. commit, blob)
my @commits;    # names of every commit object seen
my %names;      # commit -> symbolic name reported by "git name-rev"
my %depths;     # object -> delta depth (0 when the column is absent)
my @depths;     # every delta depth, for the "depths" statistics

while (my $line = <STDIN>) {
    my ($sha1, $type, $size, $space, $offset, $depth, $parent) = split(/\s+/, $line);
    # Only object lines start with a full hex object name: 40 digits in a
    # SHA-1 repository, 64 in a SHA-256 one.  Everything else in the
    # listing (summary lines and the like) is skipped.
    next unless (defined $sha1 && $sha1 =~ /^(?:[0-9a-f]{40}|[0-9a-f]{64})$/);
    $depths{$sha1} = $depth || 0;
    push(@depths, $depth || 0);
    push(@commits, $sha1) if ($type eq 'commit');
    push(@roots, $sha1) unless $parent;
    $parents{$sha1} = $parent;
    $types{$sha1} = $type;
    # Non-delta objects have no base; do not file them under an
    # undefined key.
    push(@{$children{$parent}}, $sha1) if $parent;
    $sizes{$sha1} = $size;
}
# Resolving file names is slow and only shows up in the per-object
# listings, so it is skipped unless -filenames is combined with -tree
# or -dump.
if ($filenames && ($tree || $dump)) {
    # Map commits to symbolic names such as "tags/v1.2.0~142".  The
    # list form of open() bypasses the shell, and the result is checked
    # so a failing git invocation does not silently yield no names.
    open(my $name_rev, '-|', 'git', 'name-rev', '--all')
        or die "cannot run git name-rev: $!\n";
    while (my $line = <$name_rev>) {
        if ($line =~ /^(\S+)\s+(.*)$/) {
            my ($sha1, $name) = ($1, $2);
            $names{$sha1} = $name;
        }
    }
    close($name_rev);

    # Walk the full tree of every commit in the pack and remember, for
    # each object it lists, the path and the commit it was found in.
    # An object reachable from several commits keeps the last one seen.
    for my $commit (@commits) {
        my $name = $names{$commit};
        open(my $ls_tree, '-|', 'git', 'ls-tree', '-t', '-r', $commit)
            or die "cannot run git ls-tree $commit: $!\n";
        print STDERR "Plumbing tree $name\n";
        while (my $line = <$ls_tree>) {
            if ($line =~ /^(\S+)\s+(\S+)\s+(\S+)\s+(.*)$/) {
                my ($mode, $type, $sha1, $path) = ($1, $2, $3, $4);
                $paths{$sha1} = "$path @ $name";
            }
        }
        close($ls_tree);
    }
}
# Summarise a list of numbers.
#
# Returns the list (count, total, min, max, mean, median, std_dev):
#   * median is the element at index int(count / 2) of the sorted data,
#     i.e. the upper of the two middle values when count is even;
#   * std_dev is the population standard deviation (divides by count).
# An empty argument list yields all zeroes instead of dividing by zero.
sub stats {
    my @data = sort {$a <=> $b} @_;
    my $count = scalar @data;
    return (0, 0, 0, 0, 0, 0, 0) unless $count;

    my $min = $data[0];
    my $max = $data[$#data];
    my $total = 0;
    for my $datum (@data) {
        $total += $datum;
    }
    my $mean = $total / $count;
    my $median = $data[int($count / 2)];
    my $diff_sum = 0;
    for my $datum (@data) {
        $diff_sum += ($datum - $mean)**2;
    }
    my $std_dev = sqrt($diff_sum / $count);
    return ($count, $total, $min, $max, $mean, $median, $std_dev);
}
# Print a one-line summary of a list of numbers.
#
# The first argument is the label that starts the line; the remaining
# arguments are the numbers, which are handed to stats() unchanged.
sub print_stats {
    my ($label, @values) = @_;
    my @summary = stats(@values);
    printf("%s: count %s total %s min %s max %s mean %.2f median %s std_dev %.2f\n",
           $label, @summary[0 .. 6]);
}
my @sizes;            # object sizes collected by dig() since last reset
my @path_sizes;       # path sizes collected by dig() since last reset
my @all_sizes;        # object sizes of every object dig() has visited
my @all_path_sizes;   # path sizes of every object dig() has visited
my %path_sizes;       # object -> path size

# Visit $sha1 and everything stored as a delta below it, parents before
# children and children in input order.
#
#   $sha1       object to start from
#   $depth      nesting level reported for $sha1
#   $path_size  bytes already accumulated on the delta chain above $sha1
#
# Each visited object has its size and path size appended to the
# accumulators above and, with -tree, is printed as one indented line.
sub dig {
    my ($sha1, $depth, $path_size) = @_;
    # Explicit stack instead of recursion; entries are popped from the
    # end, so children are pushed in reverse to keep their order.
    my @todo = ([$sha1, $depth, $path_size]);
    while (@todo) {
        my ($object, $level, $above) = @{pop(@todo)};
        my $object_size = $sizes{$object};
        my $chain_size = $above + $object_size;

        push(@sizes, $object_size);
        push(@all_sizes, $object_size);
        push(@path_sizes, $chain_size);
        push(@all_path_sizes, $chain_size);
        $path_sizes{$object} = $chain_size;

        if ($tree) {
            printf("%3d%s %6s %s %8d %8d %s\n",
                   $level, (" " x $level), $types{$object},
                   $object, $object_size, $chain_size, $paths{$object});
        }

        my $deltas = $children{$object} || [];
        for my $child (reverse @{$deltas}) {
            push(@todo, [$child, $level + 1, $chain_size]);
        }
    }
}
my @tree_sizes;         # total object size of each delta tree
my @tree_path_sizes;    # total path size of each delta tree

# Walk every delta tree in turn.  The per-tree accumulators are emptied
# first so that the totals recorded below cover this tree only.
for my $root (@roots) {
    @sizes = ();
    @path_sizes = ();
    dig($root, 0, 0);

    # Element 1 of the stats() result is the total.
    push(@tree_sizes, (stats(@sizes))[1]);
    push(@tree_path_sizes, (stats(@path_sizes))[1]);

    next unless $tree;
    # Statistics are pointless for a tree consisting of one object.
    if (@sizes > 1) {
        print_stats(" size", @sizes);
        print_stats("path size", @path_sizes);
    }
    print "\n";
}
# Final output: with -dump, one line per object in lexical order of
# object name; otherwise the whole-pack statistics.
if ($dump) {
    for my $sha1 (sort keys %sizes) {
        my @fields = ($sha1, $sizes{$sha1}, $path_sizes{$sha1},
                      $depths{$sha1}, $paths{$sha1});
        # Undefined fields (e.g. no path known) come out as empty strings.
        print join(' ', map { defined($_) ? $_ : '' } @fields) . "\n";
    }
} else {
    my @reports = (
        [" all sizes", \@all_sizes],
        [" all path sizes", \@all_path_sizes],
        [" tree sizes", \@tree_sizes],
        ["tree path sizes", \@tree_path_sizes],
        [" depths", \@depths],
    );
    for my $report (@reports) {
        my ($label, $values) = @{$report};
        print_stats($label, @{$values});
    }
}