diff options
author | Lars Schneider <larsxschneider@gmail.com> | 2016-10-16 16:20:37 -0700 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2016-10-17 11:45:52 -0700 |
commit | edcc85814c87ebd7f3b1b7d3979fac3dfb84d308 (patch) | |
tree | 92fcb94be20c944c989b91d6cc8b4a0f54af2a01 /t/t0021 | |
parent | convert: prepare filter.<driver>.process option (diff) | |
download | tgif-edcc85814c87ebd7f3b1b7d3979fac3dfb84d308.tar.xz |
convert: add filter.<driver>.process option
Git's clean/smudge mechanism invokes an external filter process for
every single blob that is affected by a filter. If Git filters a lot of
blobs then the startup time of the external filter processes can become
a significant part of the overall Git execution time.
In a preliminary performance test this developer used a clean/smudge
filter written in golang to filter 12,000 files. This process took 364s
with the existing filter mechanism and 5s with the new mechanism. See
details here: https://github.com/github/git-lfs/pull/1382
This patch adds the `filter.<driver>.process` string option which, if
used, keeps the external filter process running and processes all blobs
with the packet format (pkt-line) based protocol over standard input and
standard output. The full protocol is explained in detail in
`Documentation/gitattributes.txt`.
A few key decisions:
* The long running filter process is referred to as filter protocol
version 2 because the existing single shot filter invocation is
considered version 1.
* Git sends a welcome message and expects a response right after the
external filter process has started. This ensures that Git will not
hang if a version 1 filter is incorrectly used with the
filter.<driver>.process option for version 2 filters. In addition,
Git can detect this kind of error and warn the user.
* The status of a filter operation (e.g. "success" or "error") is set
before the actual response and (if necessary!) re-set after the
response. The advantage of this two step status response is that if
the filter detects an error early, then the filter can communicate
this and Git does not even need to create structures to read the
response.
* All status responses are pkt-line lists terminated with a flush
packet. This allows us to send other status fields with the same
protocol in the future.
Helped-by: Martin-Louis Bright <mlbright@gmail.com>
Reviewed-by: Jakub Narebski <jnareb@gmail.com>
Signed-off-by: Lars Schneider <larsxschneider@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Diffstat (limited to 't/t0021')
-rwxr-xr-x | t/t0021/rot13-filter.pl | 192 |
1 files changed, 192 insertions, 0 deletions
#!/usr/bin/perl
#
# t/t0021/rot13-filter.pl
#
# Example implementation for the Git filter protocol version 2
# See Documentation/gitattributes.txt, section "Filter Protocol"
#
# The script takes the list of supported protocol capabilities as
# arguments ("clean", "smudge", etc).
#
# This implementation supports special test cases:
# (1) If data with the pathname "clean-write-fail.r" is processed with
#     a "clean" operation then the write operation will die.
# (2) If data with the pathname "smudge-write-fail.r" is processed with
#     a "smudge" operation then the write operation will die.
# (3) If data with the pathname "error.r" is processed with any
#     operation then the filter signals that it cannot or does not want
#     to process the file.
# (4) If data with the pathname "abort.r" is processed with any
#     operation then the filter signals that it cannot or does not want
#     to process the file and any file after that is processed with the
#     same command.
#
# Every step is appended to "rot13-filter.log" in the current directory.
#

use strict;
use warnings;
use IO::Handle;    # provides ->flush / ->autoflush on every supported perl

# Largest payload that fits in one pkt-line packet (the 4-byte length
# header is not part of the payload).
my $MAX_PACKET_CONTENT_SIZE = 65516;
my @capabilities            = @ARGV;

open my $debug, ">>", "rot13-filter.log" or die "cannot open log file: $!";

# The log is inspected while the filter is still running, so push every
# write out immediately instead of flushing by hand after each print.
$debug->autoflush(1);

# Packets carry raw bytes. Switch both pipes to binary mode before the
# first packet is read or written so no I/O layer can alter the payload.
binmode(STDIN);
binmode(STDOUT);

# Return a copy of the given string with rot13 applied to ASCII letters.
sub rot13 {
    my $str = shift;
    $str =~ y/A-Za-z/N-ZA-Mn-za-m/;
    return $str;
}

# Read one pkt-line packet from STDIN.
#
# Returns a two-element list ( $is_flush, $content ):
#   ( 1, "" )       for a flush packet ("0000")
#   ( 0, $content ) for a data packet
# Logs "STOP" and exits when STDIN is at EOF on a packet boundary.
# Dies on a read error, a malformed length header or a truncated packet.
sub packet_bin_read {
    my $buffer;
    my $bytes_read = read STDIN, $buffer, 4;
    if ( !defined $bytes_read ) {
        die "cannot read packet header: $!";
    }
    if ( $bytes_read == 0 ) {

        # EOF - Git stopped talking to us!
        print $debug "STOP\n";
        exit();
    }
    elsif ( $bytes_read != 4 ) {
        die "invalid packet: '$buffer'";
    }

    # hex() would silently turn garbage into 0, i.e. into a flush packet.
    if ( $buffer !~ /\A[0-9a-fA-F]{4}\z/ ) {
        die "invalid packet: '$buffer'";
    }

    my $pkt_size = hex($buffer);
    if ( $pkt_size == 0 ) {
        return ( 1, "" );
    }
    elsif ( $pkt_size > 4 ) {
        my $content_size = $pkt_size - 4;
        $bytes_read = read STDIN, $buffer, $content_size;
        if ( !defined $bytes_read ) {
            die "cannot read packet content: $!";
        }
        if ( $bytes_read != $content_size ) {
            die "invalid packet ($content_size bytes expected; $bytes_read bytes read)";
        }
        return ( 0, $buffer );
    }
    else {
        die "invalid packet size: $pkt_size";
    }
}

# Read one text packet and strip its mandatory trailing LF.
#
# Returns ( $is_flush, $line ) like packet_bin_read().
# Dies when the packet content does not end in LF.
sub packet_txt_read {
    my ( $res, $buf ) = packet_bin_read();
    unless ( $buf =~ s/\n$// ) {
        die "A non-binary line MUST be terminated by an LF.";
    }
    return ( $res, $buf );
}

# Write the given bytes to STDOUT as a single data packet and flush.
sub packet_bin_write {
    my $buf = shift;
    print STDOUT sprintf( "%04x", length($buf) + 4 ), $buf;
    STDOUT->flush();
    return;
}

# Write the given line, followed by LF, as a single data packet.
sub packet_txt_write {
    my $line = shift;
    packet_bin_write( $line . "\n" );
    return;
}

# Write a flush packet ("0000") to STDOUT.
sub packet_flush {
    print STDOUT "0000";
    STDOUT->flush();
    return;
}

# Die with $message unless the ( $is_flush, $content ) pair that was read
# matches the expected pair. Both elements are compared explicitly; using
# "eq" on two lists would only look at their last elements.
sub packet_expect {
    my ( $message, $want_flush, $want_content, $got_flush, $got_content ) = @_;
    if ( $got_flush != $want_flush or $got_content ne $want_content ) {
        die $message;
    }
    return;
}

print $debug "START\n";

# Welcome handshake: client announcement, version, flush.
packet_expect( "bad initialize",  0, "git-filter-client", packet_txt_read() );
packet_expect( "bad version",     0, "version=2",         packet_txt_read() );
packet_expect( "bad version end", 1, "",                  packet_bin_read() );

packet_txt_write("git-filter-server");
packet_txt_write("version=2");
packet_flush();

# Capability negotiation: the client lists what it supports and we answer
# with the capabilities given on the command line.
packet_expect( "bad capability",     0, "capability=clean",  packet_txt_read() );
packet_expect( "bad capability",     0, "capability=smudge", packet_txt_read() );
packet_expect( "bad capability end", 1, "",                  packet_bin_read() );

foreach my $capability (@capabilities) {
    packet_txt_write( "capability=" . $capability );
}
packet_flush();
print $debug "init handshake complete\n";

# One iteration per filter request; packet_bin_read() exits on EOF.
while (1) {
    my ( undef, $command_line ) = packet_txt_read();
    my ($command) = $command_line =~ /^command=([^=]+)$/;
    if ( !defined $command ) {
        die "bad command '$command_line'";
    }
    print $debug "IN: $command";

    # A pathname may itself contain "=", so accept anything after the key.
    my ( undef, $pathname_line ) = packet_txt_read();
    my ($pathname) = $pathname_line =~ /\Apathname=(.+)\z/s;
    if ( !defined $pathname ) {
        die "bad pathname '$pathname_line'";
    }
    print $debug " $pathname";

    # Flush packet that terminates the request header.
    packet_bin_read();

    # Collect the content packets up to the terminating flush packet.
    my $input = "";
    my $done  = 0;
    while ( !$done ) {
        my $chunk;
        ( $done, $chunk ) = packet_bin_read();
        $input .= $chunk;
    }
    print $debug " " . length($input) . " [OK] -- ";

    my $output;
    if ( $pathname eq "error.r" or $pathname eq "abort.r" ) {
        $output = "";
    }
    elsif ( ( $command eq "clean" or $command eq "smudge" )
        and grep { $_ eq $command } @capabilities )
    {
        $output = rot13($input);
    }
    else {
        die "bad command '$command'";
    }

    print $debug "OUT: " . length($output) . " ";

    if ( $pathname eq "error.r" ) {
        print $debug "[ERROR]\n";
        packet_txt_write("status=error");
        packet_flush();
    }
    elsif ( $pathname eq "abort.r" ) {
        print $debug "[ABORT]\n";
        packet_txt_write("status=abort");
        packet_flush();
    }
    else {
        packet_txt_write("status=success");
        packet_flush();

        # Simulated failure after the status was already reported.
        if ( $pathname eq "${command}-write-fail.r" ) {
            print $debug "[WRITE FAIL]\n";
            die "${command} write error";
        }

        # Send the result in packets of at most $MAX_PACKET_CONTENT_SIZE
        # bytes; 4-arg substr removes each chunk from $output as it goes.
        while ( length($output) > 0 ) {
            my $packet = substr( $output, 0, $MAX_PACKET_CONTENT_SIZE, "" );
            packet_bin_write($packet);

            # dots represent the number of packets
            print $debug ".";
        }

        # First flush ends the content.
        packet_flush();
        print $debug " [OK]\n";

        # Second flush is an empty trailing status list, which leaves the
        # "status=success" sent before the content in effect.
        packet_flush();
    }
}