contrib: update stats/mailmap script

This version changes quite a few things: 1. The original parsed the mailmap file itself, and it did it wrong (it did not understand entries with an extra email key). Instead, this version uses git's "%aE" and "%aN" formats to have git perform the mapping, meaning we do not have to read .mailmap at all, but still operate on the current state that git sees (and it also works properly from subdirs). 2. The original would find multiple names for an email, but not the other way around. This version can do either or both. If we find multiple emails for a name, the resolution is less obvious than the other way around. However, it can still be a starting point for a human to investigate. 3. The original would order only by count, not by recency. This version can do either. Combined with showing the counts, it can be easier to decide how to resolve. 4. This version shows similar entries in a blank-delimited stanza, which makes it more clear which options you are picking from. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com>
author: Jeff King <peff@peff.net> 2012-12-12 06:41:41 -0500
committer: Junio C Hamano <gitster@pobox.com> 2012-12-12 11:09:11 -0800
commit: 53474eb92ff0571e0b1eacd88d638692b96a2018 (patch)
tree: a02443524dbe364d445fe6814c9ae7cb445d015a /contrib/stats
parent: .mailmap: normalize emails for Linus Torvalds (diff)
download: tgif-53474eb92ff0571e0b1eacd88d638692b96a2018.tar.xz
1 files changed, 64 insertions, 32 deletions
diff --git a/contrib/stats/mailmap.pl b/contrib/stats/mailmap.pl
index 4b852e2455..9513f5e35b 100755
--- a/contrib/stats/mailmap.pl
+++ b/contrib/stats/mailmap.pl
@@ -1,38 +1,70 @@
-#!/usr/bin/perl -w
-my %mailmap = ();
-open I, "<", ".mailmap";
-while (<I>) {
-	chomp;
-	next if /^#/;
-	if (my ($author, $mail) = /^(.*?)\s+<(.+)>$/) {
-		$mailmap{$mail} = $author;
-	}
+#!/usr/bin/perl
+
+use warnings 'all';
+use strict;
+use Getopt::Long;
+
+my $match_emails;  # true: report each email that is used with several names
+my $match_names;  # true: report each name that is used with several emails
+my $order_by = 'count';  # field of a pair's record used for sorting and display
+Getopt::Long::Configure(qw(bundling));  # allow bundled short options such as -ent
+GetOptions(
+	'emails|e!' => \$match_emails,
+	'names|n!'  => \$match_names,
+	'count|c'   => sub { $order_by = 'count' },  # order by number of occurrences
+	'time|t'    => sub { $order_by = 'stamp' },  # order by newest author timestamp
+) or exit 1;
+$match_emails = 1 unless $match_names;  # fall back to email mode when name mode is off
+
+my $email = {};
+my $name = {};
+
+open(my $fh, '-|', "git log --format='%at <%aE> %aN'") or die "cannot run git log: $!\n";
+while(<$fh>) {  # one line per commit: "<author time> <<mapped email>> <mapped name>"
+	my ($t, $e, $n) = /(\S+) <(\S+)> (.*)/ or next;  # skip lines that do not parse instead of recording undef
+	mark($email, $e, $n, $t);  # email -> name -> record
+	mark($name, $n, $e, $t);  # name -> email -> record
 }
-close I;
-
-my %mail2author = ();
-open I, "git log --pretty='format:%ae	%an' |";
-while (<I>) {
-	chomp;
-	my ($mail, $author) = split(/\t/, $_);
-	next if exists $mailmap{$mail};
-	$mail2author{$mail} ||= {};
-	$mail2author{$mail}{$author} ||= 0;
-	$mail2author{$mail}{$author}++;
+close($fh);
+
+if ($match_emails) {  # one stanza per email that was seen with more than one name
+	foreach my $e (dups($email)) {
+		foreach my $n (vals($email->{$e})) {  # names for this email, largest $order_by first
+			show($n, $e, $email->{$e}->{$n});
+		}
+		print "\n";  # blank line terminates the stanza
+	}
 }
-close I;
-
-while (my ($mail, $authorcount) = each %mail2author) {
-	# %$authorcount is ($author => $count);
-	# sort and show the names from the most frequent ones.
-	my @names = (map { $_->[0] }
-		sort { $b->[1] <=> $a->[1] }
-		map { [$_, $authorcount->{$_}] }
-		keys %$authorcount);
-	if (1 < @names) {
-		for (@names) {
-			print "$_ <$mail>\n";
+if ($match_names) {  # one stanza per name that was seen with more than one email
+	foreach my $n (dups($name)) {
+		foreach my $e (vals($name->{$n})) {  # emails for this name, largest $order_by first
+			show($n, $e, $name->{$n}->{$e});
 		}
+		print "\n";  # blank line terminates the stanza
 	}
 }
+exit 0;
 
+sub mark {  # count one more occurrence of the pair ($k, $v) in the two-level hash $h, seen at time $t
+	my ($h, $k, $v, $t) = @_;
+	my $e = $h->{$k}->{$v} ||= { count => 0, stamp => 0 };  # create the record the first time the pair is seen
+	$e->{count}++;
+	$e->{stamp} = $t unless $t < $e->{stamp};  # keep the largest timestamp seen so far
+}
+
+sub dups {  # return the keys of the two-level hash $h whose inner hash holds more than one key
+	my $h = shift;
+	return grep { keys(%{ $h->{$_} }) > 1 } keys %$h;  # explicit deref: keys on a hashref is an error since perl 5.24
+}
+
+sub vals {  # return the keys of hashref $h, sorted so the largest $order_by field comes first
+	my $h = shift;
+	return sort {
+		$h->{$b}->{$order_by} <=> $h->{$a}->{$order_by}
+	} keys %$h;  # explicit deref: keys on a hashref is an error since perl 5.24
+}
+
+sub show {  # print one "name <email> (value)" line for name $n, email $e and record $h
+	my ($n, $e, $h) = @_;
+	print "$n <$e> ($h->{$order_by})\n";  # value is the record field selected by $order_by
+}
author	Jeff King <peff@peff.net>	2012-12-12 06:41:41 -0500
committer	Junio C Hamano <gitster@pobox.com>	2012-12-12 11:09:11 -0800
commit	53474eb92ff0571e0b1eacd88d638692b96a2018 (patch)
tree	a02443524dbe364d445fe6814c9ae7cb445d015a /contrib/stats
parent	.mailmap: normalize emails for Linus Torvalds (diff)
download	tgif-53474eb92ff0571e0b1eacd88d638692b96a2018.tar.xz