diff options
Diffstat (limited to 'contrib/stats')
-rwxr-xr-x | contrib/stats/git-common-hash | 26 | ||||
-rwxr-xr-x | contrib/stats/mailmap.pl | 38 | ||||
-rwxr-xr-x | contrib/stats/packinfo.pl | 212 |
3 files changed, 276 insertions, 0 deletions
diff --git a/contrib/stats/git-common-hash b/contrib/stats/git-common-hash
new file mode 100755
index 0000000000..e27fd088be
--- /dev/null
+++ b/contrib/stats/git-common-hash
@@ -0,0 +1,41 @@
+#!/bin/sh
+
+# This script displays the distribution of longest common hash prefixes.
+# This can be used to determine the minimum prefix length to use
+# for object names to be unique.
+#
+# Output: one "<length>: <count>" line per prefix length, where <count> is
+# the number of objects whose longest prefix shared with any other object
+# has that many hex digits.
+
+git rev-list --objects --all | LC_ALL=C sort | perl -lne '
+    # Keep only the object name: the leading run of hex digits (40 for
+    # SHA-1, 64 for SHA-256); a path that follows it is dropped.
+    next unless /^([0-9a-f]+)/;
+    $_ = $1;
+    # uncomment next line for a distribution of bits instead of hex chars
+    # $_ = unpack("B*",pack("H*",$_));
+    if (defined $p) {
+        # XOR of two strings is NUL wherever they agree, so the leading
+        # run of NULs is as long as their common prefix.
+        ($p ^ $_) =~ /^(\0*)/;
+        $common = length $1;
+        if (defined $pcommon) {
+            # previous item: the longer of the prefixes it shares with
+            # its two neighbours in sorted order
+            $count[$pcommon > $common ? $pcommon : $common]++;
+        } else {
+            $count[$common]++; # first item
+        }
+    }
+    $p = $_;
+    $pcommon = $common;
+    END {
+        # No objects at all: there is nothing to count or print.
+        if (defined $p) {
+            $count[$common || 0]++; # last item
+            # lengths that never occurred are reported as 0
+            print "$_: " . ($count[$_] || 0) for 0..$#count;
+        }
+    }
+'
diff --git a/contrib/stats/mailmap.pl b/contrib/stats/mailmap.pl
new file mode 100755
index 0000000000..4b852e2455
--- /dev/null
+++ b/contrib/stats/mailmap.pl
@@ -0,0 +1,53 @@
+#!/usr/bin/perl -w
+#
+# Prints every author e-mail address that "git log" shows under more than
+# one author name and that .mailmap (read from the current directory) does
+# not list yet, as one "Name <address>" line per name, most frequent
+# name first.
+
+use strict;
+
+# address => name, for the addresses that .mailmap already lists
+my %mailmap = ();
+# A missing or unreadable .mailmap simply means nothing is listed yet.
+if (open(my $map, "<", ".mailmap")) {
+    while (<$map>) {
+        chomp;
+        next if /^#/;
+        # with several <...> on a line, the last one is the commit address
+        if (my ($author, $mail) = /^(.*?)\s*<([^<>]+)>\s*$/) {
+            $mailmap{$mail} = $author;
+        }
+    }
+    close $map;
+}
+
+# address => { name => number of log entries using that name }
+my %mail2author = ();
+# %x09 makes git emit the TAB separator itself, so the format string does
+# not depend on a literal tab surviving editors and mail; the list form of
+# open runs git without a shell.
+open(my $log, "-|", "git", "log", "--pretty=format:%ae%x09%an")
+    or die "cannot run git log: $!\n";
+while (<$log>) {
+    chomp;
+    my ($mail, $author) = split(/\t/, $_, 2);
+    next unless defined $author;    # not an "address<TAB>name" line
+    next if exists $mailmap{$mail};
+    $mail2author{$mail}{$author}++;
+}
+close $log or die "git log failed\n";
+
+# Sorted addresses and a name tie-break keep the output stable between runs.
+for my $mail (sort keys %mail2author) {
+    # %$authorcount is ($author => $count);
+    # sort and show the names from the most frequent ones.
+    my $authorcount = $mail2author{$mail};
+    my @names = sort { $authorcount->{$b} <=> $authorcount->{$a} || $a cmp $b }
+        keys %$authorcount;
+    if (1 < @names) {
+        for (@names) {
+            print "$_ <$mail>\n";
+        }
+    }
+}
diff --git a/contrib/stats/packinfo.pl b/contrib/stats/packinfo.pl
new file mode 100755
index 0000000000..be188c0f11
--- /dev/null
+++ b/contrib/stats/packinfo.pl
@@ -0,0 +1,245 @@
+#!/usr/bin/perl
+#
+# This tool will print vaguely pretty information about a pack. It
+# expects the output of "git verify-pack -v" as input on stdin.
+#
+# $ git verify-pack -v <pack>.idx | packinfo.pl
+#
+# This prints some full-pack statistics; currently "all sizes", "all
+# path sizes", "tree sizes", "tree path sizes", and "depths".
+#
+# * "all sizes" stats are across every object size in the file;
+#   full sizes for base objects, and delta size for deltas.
+# * "all path sizes" stats are across all objects' "path sizes".
+#   A path size is the sum of the size of the delta chain, including the
+#   base object. In other words, it's how many bytes need to be read to
+#   reassemble the file from deltas.
+# * "tree sizes" are object sizes grouped into delta trees.
+# * "tree path sizes" are path sizes grouped into delta trees.
+# * "depths" should be obvious.
+#
+# When run as:
+#
+# $ git verify-pack -v <pack>.idx | packinfo.pl -tree
+#
+# the trees of objects are output along with the stats. This looks
+# like:
+#
+#   0 commit 031321c6...      803      803
+#
+#   0   blob 03156f21...     1767     1767
+#   1    blob f52a9d7f...       10     1777
+#   2     blob a8cc5739...       51     1828
+#   3      blob 660e90b1...       15     1843
+#   4       blob 0cb8e3bb...       33     1876
+#   2     blob e48607f0...      311     2088
+#      size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85
+# path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26
+#
+# The first number after the sha1 is the object size, the second
+# number is the path size. The statistics are across all objects in
+# the previous delta tree. Obviously they are omitted for trees of
+# one object.
+#
+# When run as:
+#
+# $ git verify-pack -v <pack>.idx | packinfo.pl -tree -filenames
+#
+# it adds filenames to the tree. Getting this information is slow:
+#
+#   0   blob 03156f21...     1767     1767 Documentation/git-lost-found.txt @ tags/v1.2.0~142
+#   1    blob f52a9d7f...       10     1777 Documentation/git-lost-found.txt @ tags/v1.5.0-rc1~74
+#   2     blob a8cc5739...       51     1828 Documentation/git-lost+found.txt @ tags/v0.99.9h^0
+#   3      blob 660e90b1...       15     1843 Documentation/git-lost+found.txt @ master~3222^2~2
+#   4       blob 0cb8e3bb...       33     1876 Documentation/git-lost+found.txt @ master~3222^2~3
+#   2     blob e48607f0...      311     2088 Documentation/git-lost-found.txt @ tags/v1.5.2-rc3~4
+#      size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85
+# path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26
+#
+# When run as:
+#
+# $ git verify-pack -v <pack>.idx | packinfo.pl -dump
+#
+# it prints out "sha1 size pathsize depth" for each sha1 in lexical
+# order.
+#
+# 000079a2eaef17b7eae70e1f0f635557ea67b644 30 472 7
+# 00013cafe6980411aa6fdd940784917b5ff50f0a 44 1542 4
+# 000182eacf99cde27d5916aa415921924b82972c 499 499 0
+# ...
+#
+# This is handy for comparing two packs. Adding "-filenames" will add
+# filenames, as per "-tree -filenames" above.
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+my $filenames = 0;
+my $tree = 0;
+my $dump = 0;
+GetOptions("tree" => \$tree,
+           "filenames" => \$filenames,
+           "dump" => \$dump)
+    or die "usage: $0 [-tree] [-dump] [-filenames] < verify-pack-output\n";
+
+# Per-object data, keyed by object name, filled from the input lines.
+my %children;   # parent => [ objects that name it as their parent ]
+my %sizes;      # size field
+my %types;      # type field
+my %depths;     # depth field, 0 when the line has none
+my %paths;      # "path @ commit name", only filled with -filenames
+my %names;      # commit => name printed by "git name-rev --all"
+my @roots;      # objects whose line names no parent
+my @commits;    # objects of type commit
+my @depths;     # every depth, for the "depths" statistics
+
+# Each object line is "sha1 type size space offset [depth parent]". Lines
+# whose first field is not a full object name (40 hex digits for SHA-1, 64
+# for SHA-256) are not object lines and are skipped.
+while (<STDIN>) {
+    # split(' ') ignores leading whitespace and never yields empty fields
+    my ($sha1, $type, $size, $space, $offset, $depth, $parent) = split(' ', $_);
+    next unless defined $size;
+    next unless $sha1 =~ /^(?:[0-9a-f]{40}|[0-9a-f]{64})$/;
+    $depth ||= 0;
+    $depths{$sha1} = $depth;
+    push(@depths, $depth);
+    push(@commits, $sha1) if ($type eq 'commit');
+    $types{$sha1} = $type;
+    $sizes{$sha1} = $size;
+    if ($parent) {
+        push(@{$children{$parent}}, $sha1);
+    } else {
+        # no parent: this object is the root of a delta tree
+        push(@roots, $sha1);
+    }
+}
+
+# With -filenames, label the objects that "git ls-tree" lists with their
+# path and a commit name. This runs git ls-tree once per commit read above.
+if ($filenames && ($tree || $dump)) {
+    # the list form of open runs git directly, without a shell
+    open(my $name_rev, "-|", "git", "name-rev", "--all")
+        or die "cannot run git name-rev: $!\n";
+    while (<$name_rev>) {
+        if (/^(\S+)\s+(.*)$/) {
+            my ($sha1, $name) = ($1, $2);
+            $names{$sha1} = $name;
+        }
+    }
+    close $name_rev;
+
+    for my $commit (@commits) {
+        # a commit that name-rev did not list is labelled with its own name
+        my $name = defined $names{$commit} ? $names{$commit} : $commit;
+        open(my $ls_tree, "-|", "git", "ls-tree", "-t", "-r", $commit)
+            or die "cannot run git ls-tree: $!\n";
+        print STDERR "Plumbing tree $name\n";
+        while (<$ls_tree>) {
+            if (/^(\S+)\s+(\S+)\s+(\S+)\s+(.*)$/) {
+                my ($mode, $type, $sha1, $path) = ($1, $2, $3, $4);
+                # later commits overwrite earlier ones for the same object
+                $paths{$sha1} = "$path @ $name";
+            }
+        }
+        close $ls_tree;
+    }
+}
+
+# Returns (count, total, min, max, mean, median, std_dev) for a list of
+# numbers, or all zeros for an empty list. The median is the element at
+# index int(count / 2) of the sorted list, and std_dev divides by count.
+sub stats {
+    my @data = sort {$a <=> $b} @_;
+    my $count = scalar @data;
+    # no data (e.g. empty input) would otherwise divide by zero below
+    return (0, 0, 0, 0, 0, 0, 0) unless $count;
+    my ($min, $max) = @data[0, -1];
+    my $total = 0;
+    $total += $_ for @data;
+    my $mean = $total / $count;
+    my $median = $data[int($count / 2)];
+    my $diff_sum = 0;
+    $diff_sum += ($_ - $mean)**2 for @data;
+    my $std_dev = sqrt($diff_sum / $count);
+    return ($count, $total, $min, $max, $mean, $median, $std_dev);
+}
+
+# Prints the stats() of the remaining arguments on one line, labelled $name.
+sub print_stats {
+    my $name = shift;
+    my ($count, $total, $min, $max, $mean, $median, $std_dev) = stats(@_);
+    printf("%s: count %s total %s min %s max %s mean %.2f median %s std_dev %.2f\n",
+           $name, $count, $total, $min, $max, $mean, $median, $std_dev);
+}
+
+my @sizes;          # sizes in the delta tree currently being walked
+my @path_sizes;     # path sizes in the delta tree currently being walked
+my @all_sizes;      # sizes of every object reached so far
+my @all_path_sizes; # path sizes of every object reached so far
+my %path_sizes;     # object => path size
+
+# Walks the delta tree below $sha1 depth first. $depth is the nesting level
+# of $sha1 and $path_size the summed size of the objects above it in its
+# delta chain. Records sizes and path sizes for the statistics and, with
+# -tree, prints one line per object.
+sub dig {
+    # a long delta chain must not trigger the "Deep recursion" warning
+    no warnings 'recursion';
+    my ($sha1, $depth, $path_size) = @_;
+    my $size = $sizes{$sha1};
+    $path_size += $size;
+    push(@sizes, $size);
+    push(@all_sizes, $size);
+    push(@path_sizes, $path_size);
+    push(@all_path_sizes, $path_size);
+    $path_sizes{$sha1} = $path_size;
+    if ($tree) {
+        printf("%3d%s %6s %s %8d %8d %s\n",
+               $depth, (" " x $depth), $types{$sha1},
+               $sha1, $size, $path_size,
+               defined $paths{$sha1} ? $paths{$sha1} : "");
+    }
+    # "|| []" keeps leaf objects from being autovivified into %children
+    for my $child (@{$children{$sha1} || []}) {
+        dig($child, $depth + 1, $path_size);
+    }
+}
+
+my @tree_sizes;         # total size of each delta tree
+my @tree_path_sizes;    # total path size of each delta tree
+
+for my $root (@roots) {
+    @sizes = ();
+    @path_sizes = ();
+    dig($root, 0, 0);
+    my (undef, $sz_total) = stats(@sizes);
+    my (undef, $psz_total) = stats(@path_sizes);
+    push(@tree_sizes, $sz_total);
+    push(@tree_path_sizes, $psz_total);
+    if ($tree) {
+        # per-tree statistics are pointless for a tree of one object
+        if (@sizes > 1) {
+            print_stats("     size", @sizes);
+            print_stats("path size", @path_sizes);
+        }
+        print "\n";
+    }
+}
+
+if ($dump) {
+    for my $sha1 (sort keys %sizes) {
+        # an object never reached from a root has no path size, and paths
+        # exist only with -filenames; both are printed as empty fields
+        my $path_size = defined $path_sizes{$sha1} ? $path_sizes{$sha1} : "";
+        my $path = defined $paths{$sha1} ? $paths{$sha1} : "";
+        print "$sha1 $sizes{$sha1} $path_size $depths{$sha1} $path\n";
+    }
+} else {
+    print_stats("      all sizes", @all_sizes);
+    print_stats(" all path sizes", @all_path_sizes);
+    print_stats("     tree sizes", @tree_sizes);
+    print_stats("tree path sizes", @tree_path_sizes);
+    print_stats("         depths", @depths);
+}