diff options
-rw-r--r-- | Documentation/git-cat-file.txt | 79 | ||||
-rw-r--r-- | builtin/cat-file.c | 210 | ||||
-rw-r--r-- | cache.h | 1 | ||||
-rw-r--r-- | pack-revindex.c | 108 | ||||
-rw-r--r-- | sha1_file.c | 22 | ||||
-rw-r--r-- | streaming.c | 2 | ||||
-rwxr-xr-x | t/t1006-cat-file.sh | 74 |
7 files changed, 390 insertions, 106 deletions
diff --git a/Documentation/git-cat-file.txt b/Documentation/git-cat-file.txt index 30d585af5d..3ddec0b65b 100644 --- a/Documentation/git-cat-file.txt +++ b/Documentation/git-cat-file.txt @@ -58,12 +58,16 @@ OPTIONS to apply the filter to the content recorded in the index at <path>. --batch:: - Print the SHA-1, type, size, and contents of each object provided on - stdin. May not be combined with any other options or arguments. +--batch=<format>:: + Print object information and contents for each object provided + on stdin. May not be combined with any other options or arguments. + See the section `BATCH OUTPUT` below for details. --batch-check:: - Print the SHA-1, type, and size of each object provided on stdin. May not - be combined with any other options or arguments. +--batch-check=<format>:: + Print object information for each object provided on stdin. May + not be combined with any other options or arguments. See the + section `BATCH OUTPUT` below for details. OUTPUT ------ @@ -78,28 +82,81 @@ If '-p' is specified, the contents of <object> are pretty-printed. If <type> is specified, the raw (though uncompressed) contents of the <object> will be returned. -If '--batch' is specified, output of the following form is printed for each -object specified on stdin: +BATCH OUTPUT +------------ + +If `--batch` or `--batch-check` is given, `cat-file` will read objects +from stdin, one per line, and print information about them. + +Each line is split at the first whitespace boundary. All characters +before that whitespace are considered as a whole object name, and are +parsed as if given to linkgit:git-rev-parse[1]. Characters after that +whitespace can be accessed using the `%(rest)` atom (see below). + +You can specify the information shown for each object by using a custom +`<format>`. The `<format>` is copied literally to stdout for each +object, with placeholders of the form `%(atom)` expanded, followed by a +newline. The available atoms are: + +`objectname`:: + The 40-hex object name of the object. + +`objecttype`:: + The type of of the object (the same as `cat-file -t` reports). + +`objectsize`:: + The size, in bytes, of the object (the same as `cat-file -s` + reports). + +`objectsize:disk`:: + The size, in bytes, that the object takes up on disk. See the + note about on-disk sizes in the `CAVEATS` section below. + +`rest`:: + The text (if any) found after the first run of whitespace on the + input line (i.e., the "rest" of the line). + +If no format is specified, the default format is `%(objectname) +%(objecttype) %(objectsize)`. + +If `--batch` is specified, the object information is followed by the +object contents (consisting of `%(objectsize)` bytes), followed by a +newline. + +For example, `--batch` without a custom format would produce: ------------ <sha1> SP <type> SP <size> LF <contents> LF ------------ -If '--batch-check' is specified, output of the following form is printed for -each object specified on stdin: +Whereas `--batch-check='%(objectname) %(objecttype)'` would produce: ------------ -<sha1> SP <type> SP <size> LF +<sha1> SP <type> LF ------------ -For both '--batch' and '--batch-check', output of the following form is printed -for each object specified on stdin that does not exist in the repository: +If a name is specified on stdin that cannot be resolved to an object in +the repository, then `cat-file` will ignore any custom format and print: ------------ <object> SP missing LF ------------ + +CAVEATS +------- + +Note that the sizes of objects on disk are reported accurately, but care +should be taken in drawing conclusions about which refs or objects are +responsible for disk usage. The size of a packed non-delta object may be +much larger than the size of objects which delta against it, but the +choice of which object is the base and which is the delta is arbitrary +and is subject to change during a repack. Note also that multiple copies +of an object may be present in the object database; in this case, it is +undefined which copy's size will be reported. + + GIT --- Part of the linkgit:git[1] suite diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 045cee7bce..0e64b4159c 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -13,9 +13,6 @@ #include "userdiff.h" #include "streaming.h" -#define BATCH 1 -#define BATCH_CHECK 2 - static int cat_one_file(int opt, const char *exp_type, const char *obj_name) { unsigned char sha1[20]; @@ -117,54 +114,174 @@ static int cat_one_file(int opt, const char *exp_type, const char *obj_name) return 0; } -static int batch_one_object(const char *obj_name, int print_contents) -{ +struct expand_data { unsigned char sha1[20]; - enum object_type type = 0; + enum object_type type; unsigned long size; - void *contents = NULL; + unsigned long disk_size; + const char *rest; + + /* + * If mark_query is true, we do not expand anything, but rather + * just mark the object_info with items we wish to query. + */ + int mark_query; + + /* + * After a mark_query run, this object_info is set up to be + * passed to sha1_object_info_extended. It will point to the data + * elements above, so you can retrieve the response from there. + */ + struct object_info info; +}; + +static int is_atom(const char *atom, const char *s, int slen) +{ + int alen = strlen(atom); + return alen == slen && !memcmp(atom, s, alen); +} + +static void expand_atom(struct strbuf *sb, const char *atom, int len, + void *vdata) +{ + struct expand_data *data = vdata; + + if (is_atom("objectname", atom, len)) { + if (!data->mark_query) + strbuf_addstr(sb, sha1_to_hex(data->sha1)); + } else if (is_atom("objecttype", atom, len)) { + if (!data->mark_query) + strbuf_addstr(sb, typename(data->type)); + } else if (is_atom("objectsize", atom, len)) { + if (data->mark_query) + data->info.sizep = &data->size; + else + strbuf_addf(sb, "%lu", data->size); + } else if (is_atom("objectsize:disk", atom, len)) { + if (data->mark_query) + data->info.disk_sizep = &data->disk_size; + else + strbuf_addf(sb, "%lu", data->disk_size); + } else if (is_atom("rest", atom, len)) { + if (!data->mark_query && data->rest) + strbuf_addstr(sb, data->rest); + } else + die("unknown format element: %.*s", len, atom); +} + +static size_t expand_format(struct strbuf *sb, const char *start, void *data) +{ + const char *end; + + if (*start != '(') + return 0; + end = strchr(start + 1, ')'); + if (!end) + die("format element '%s' does not end in ')'", start); + + expand_atom(sb, start + 1, end - start - 1, data); + + return end - start + 1; +} + +static void print_object_or_die(int fd, const unsigned char *sha1, + enum object_type type, unsigned long size) +{ + if (type == OBJ_BLOB) { + if (stream_blob_to_fd(fd, sha1, NULL, 0) < 0) + die("unable to stream %s to stdout", sha1_to_hex(sha1)); + } + else { + enum object_type rtype; + unsigned long rsize; + void *contents; + + contents = read_sha1_file(sha1, &rtype, &rsize); + if (!contents) + die("object %s disappeared", sha1_to_hex(sha1)); + if (rtype != type) + die("object %s changed type!?", sha1_to_hex(sha1)); + if (rsize != size) + die("object %s change size!?", sha1_to_hex(sha1)); + + write_or_die(fd, contents, size); + free(contents); + } +} + +struct batch_options { + int enabled; + int print_contents; + const char *format; +}; + +static int batch_one_object(const char *obj_name, struct batch_options *opt, + struct expand_data *data) +{ + struct strbuf buf = STRBUF_INIT; if (!obj_name) return 1; - if (get_sha1(obj_name, sha1)) { + if (get_sha1(obj_name, data->sha1)) { printf("%s missing\n", obj_name); fflush(stdout); return 0; } - if (print_contents == BATCH) - contents = read_sha1_file(sha1, &type, &size); - else - type = sha1_object_info(sha1, &size); - - if (type <= 0) { + data->type = sha1_object_info_extended(data->sha1, &data->info); + if (data->type <= 0) { printf("%s missing\n", obj_name); fflush(stdout); - if (print_contents == BATCH) - free(contents); return 0; } - printf("%s %s %lu\n", sha1_to_hex(sha1), typename(type), size); - fflush(stdout); + strbuf_expand(&buf, opt->format, expand_format, data); + strbuf_addch(&buf, '\n'); + write_or_die(1, buf.buf, buf.len); + strbuf_release(&buf); - if (print_contents == BATCH) { - write_or_die(1, contents, size); - printf("\n"); - fflush(stdout); - free(contents); + if (opt->print_contents) { + print_object_or_die(1, data->sha1, data->type, data->size); + write_or_die(1, "\n", 1); } - return 0; } -static int batch_objects(int print_contents) +static int batch_objects(struct batch_options *opt) { struct strbuf buf = STRBUF_INIT; + struct expand_data data; + + if (!opt->format) + opt->format = "%(objectname) %(objecttype) %(objectsize)"; + + /* + * Expand once with our special mark_query flag, which will prime the + * object_info to be handed to sha1_object_info_extended for each + * object. + */ + memset(&data, 0, sizeof(data)); + data.mark_query = 1; + strbuf_expand(&buf, opt->format, expand_format, &data); + data.mark_query = 0; while (strbuf_getline(&buf, stdin, '\n') != EOF) { - int error = batch_one_object(buf.buf, print_contents); + char *p; + int error; + + /* + * Split at first whitespace, tying off the beginning of the + * string and saving the remainder (or NULL) in data.rest. + */ + p = strpbrk(buf.buf, " \t"); + if (p) { + while (*p && strchr(" \t", *p)) + *p++ = '\0'; + } + data.rest = p; + + error = batch_one_object(buf.buf, opt, &data); if (error) return error; } @@ -186,10 +303,29 @@ static int git_cat_file_config(const char *var, const char *value, void *cb) return git_default_config(var, value, cb); } +static int batch_option_callback(const struct option *opt, + const char *arg, + int unset) +{ + struct batch_options *bo = opt->value; + + if (unset) { + memset(bo, 0, sizeof(*bo)); + return 0; + } + + bo->enabled = 1; + bo->print_contents = !strcmp(opt->long_name, "batch"); + bo->format = arg; + + return 0; +} + int cmd_cat_file(int argc, const char **argv, const char *prefix) { - int opt = 0, batch = 0; + int opt = 0; const char *exp_type = NULL, *obj_name = NULL; + struct batch_options batch = {0}; const struct option options[] = { OPT_GROUP(N_("<type> can be one of: blob, tree, commit, tag")), @@ -200,12 +336,12 @@ int cmd_cat_file(int argc, const char **argv, const char *prefix) OPT_SET_INT('p', NULL, &opt, N_("pretty-print object's content"), 'p'), OPT_SET_INT(0, "textconv", &opt, N_("for blob objects, run textconv on object's content"), 'c'), - OPT_SET_INT(0, "batch", &batch, - N_("show info and content of objects fed from the standard input"), - BATCH), - OPT_SET_INT(0, "batch-check", &batch, - N_("show info about objects fed from the standard input"), - BATCH_CHECK), + { OPTION_CALLBACK, 0, "batch", &batch, "format", + N_("show info and content of objects fed from the standard input"), + PARSE_OPT_OPTARG, batch_option_callback }, + { OPTION_CALLBACK, 0, "batch-check", &batch, "format", + N_("show info about objects fed from the standard input"), + PARSE_OPT_OPTARG, batch_option_callback }, OPT_END() }; @@ -222,19 +358,19 @@ int cmd_cat_file(int argc, const char **argv, const char *prefix) else usage_with_options(cat_file_usage, options); } - if (!opt && !batch) { + if (!opt && !batch.enabled) { if (argc == 2) { exp_type = argv[0]; obj_name = argv[1]; } else usage_with_options(cat_file_usage, options); } - if (batch && (opt || argc)) { + if (batch.enabled && (opt || argc)) { usage_with_options(cat_file_usage, options); } - if (batch) - return batch_objects(batch); + if (batch.enabled) + return batch_objects(&batch); return cat_one_file(opt, exp_type, obj_name); } @@ -1130,6 +1130,7 @@ extern int unpack_object_header(struct packed_git *, struct pack_window **, off_ struct object_info { /* Request */ unsigned long *sizep; + unsigned long *disk_sizep; /* Response */ enum { diff --git a/pack-revindex.c b/pack-revindex.c index 77a0465be6..b4d2b35bb3 100644 --- a/pack-revindex.c +++ b/pack-revindex.c @@ -59,11 +59,101 @@ static void init_pack_revindex(void) /* revindex elements are lazily initialized */ } -static int cmp_offset(const void *a_, const void *b_) +/* + * This is a least-significant-digit radix sort. + * + * It sorts each of the "n" items in "entries" by its offset field. The "max" + * parameter must be at least as large as the largest offset in the array, + * and lets us quit the sort early. + */ +static void sort_revindex(struct revindex_entry *entries, unsigned n, off_t max) { - const struct revindex_entry *a = a_; - const struct revindex_entry *b = b_; - return (a->offset < b->offset) ? -1 : (a->offset > b->offset) ? 1 : 0; + /* + * We use a "digit" size of 16 bits. That keeps our memory + * usage reasonable, and we can generally (for a 4G or smaller + * packfile) quit after two rounds of radix-sorting. + */ +#define DIGIT_SIZE (16) +#define BUCKETS (1 << DIGIT_SIZE) + /* + * We want to know the bucket that a[i] will go into when we are using + * the digit that is N bits from the (least significant) end. + */ +#define BUCKET_FOR(a, i, bits) (((a)[(i)].offset >> (bits)) & (BUCKETS-1)) + + /* + * We need O(n) temporary storage. Rather than do an extra copy of the + * partial results into "entries", we sort back and forth between the + * real array and temporary storage. In each iteration of the loop, we + * keep track of them with alias pointers, always sorting from "from" + * to "to". + */ + struct revindex_entry *tmp = xmalloc(n * sizeof(*tmp)); + struct revindex_entry *from = entries, *to = tmp; + int bits; + unsigned *pos = xmalloc(BUCKETS * sizeof(*pos)); + + /* + * If (max >> bits) is zero, then we know that the radix digit we are + * on (and any higher) will be zero for all entries, and our loop will + * be a no-op, as everybody lands in the same zero-th bucket. + */ + for (bits = 0; max >> bits; bits += DIGIT_SIZE) { + struct revindex_entry *swap; + unsigned i; + + memset(pos, 0, BUCKETS * sizeof(*pos)); + + /* + * We want pos[i] to store the index of the last element that + * will go in bucket "i" (actually one past the last element). + * To do this, we first count the items that will go in each + * bucket, which gives us a relative offset from the last + * bucket. We can then cumulatively add the index from the + * previous bucket to get the true index. + */ + for (i = 0; i < n; i++) + pos[BUCKET_FOR(from, i, bits)]++; + for (i = 1; i < BUCKETS; i++) + pos[i] += pos[i-1]; + + /* + * Now we can drop the elements into their correct buckets (in + * our temporary array). We iterate the pos counter backwards + * to avoid using an extra index to count up. And since we are + * going backwards there, we must also go backwards through the + * array itself, to keep the sort stable. + * + * Note that we use an unsigned iterator to make sure we can + * handle 2^32-1 objects, even on a 32-bit system. But this + * means we cannot use the more obvious "i >= 0" loop condition + * for counting backwards, and must instead check for + * wrap-around with UINT_MAX. + */ + for (i = n - 1; i != UINT_MAX; i--) + to[--pos[BUCKET_FOR(from, i, bits)]] = from[i]; + + /* + * Now "to" contains the most sorted list, so we swap "from" and + * "to" for the next iteration. + */ + swap = from; + from = to; + to = swap; + } + + /* + * If we ended with our data in the original array, great. If not, + * we have to move it back from the temporary storage. + */ + if (from != entries) + memcpy(entries, tmp, n * sizeof(*entries)); + free(tmp); + free(pos); + +#undef BUCKET_FOR +#undef BUCKETS +#undef DIGIT_SIZE } /* @@ -72,8 +162,8 @@ static int cmp_offset(const void *a_, const void *b_) static void create_pack_revindex(struct pack_revindex *rix) { struct packed_git *p = rix->p; - int num_ent = p->num_objects; - int i; + unsigned num_ent = p->num_objects; + unsigned i; const char *index = p->index_data; rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1)); @@ -108,13 +198,13 @@ static void create_pack_revindex(struct pack_revindex *rix) */ rix->revindex[num_ent].offset = p->pack_size - 20; rix->revindex[num_ent].nr = -1; - qsort(rix->revindex, num_ent, sizeof(*rix->revindex), cmp_offset); + sort_revindex(rix->revindex, num_ent, p->pack_size); } struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs) { int num; - int lo, hi; + unsigned lo, hi; struct pack_revindex *rix; struct revindex_entry *revindex; @@ -132,7 +222,7 @@ struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs) lo = 0; hi = p->num_objects + 1; do { - int mi = (lo + hi) / 2; + unsigned mi = lo + (hi - lo) / 2; if (revindex[mi].offset == ofs) { return revindex + mi; } else if (ofs < revindex[mi].offset) diff --git a/sha1_file.c b/sha1_file.c index 0af19c00f1..4c2365f48f 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -1697,7 +1697,8 @@ static int retry_bad_packed_offset(struct packed_git *p, off_t obj_offset) #define POI_STACK_PREALLOC 64 static int packed_object_info(struct packed_git *p, off_t obj_offset, - unsigned long *sizep, int *rtype) + unsigned long *sizep, int *rtype, + unsigned long *disk_sizep) { struct pack_window *w_curs = NULL; unsigned long size; @@ -1731,6 +1732,11 @@ static int packed_object_info(struct packed_git *p, off_t obj_offset, } } + if (disk_sizep) { + struct revindex_entry *revidx = find_pack_revindex(p, obj_offset); + *disk_sizep = revidx[1].offset - obj_offset; + } + while (type == OBJ_OFS_DELTA || type == OBJ_REF_DELTA) { off_t base_offset; /* Push the object we're going to leave behind */ @@ -2357,7 +2363,8 @@ struct packed_git *find_sha1_pack(const unsigned char *sha1, } -static int sha1_loose_object_info(const unsigned char *sha1, unsigned long *sizep) +static int sha1_loose_object_info(const unsigned char *sha1, unsigned long *sizep, + unsigned long *disk_sizep) { int status; unsigned long mapsize, size; @@ -2368,6 +2375,8 @@ static int sha1_loose_object_info(const unsigned char *sha1, unsigned long *size map = map_sha1_file(sha1, &mapsize); if (!map) return -1; + if (disk_sizep) + *disk_sizep = mapsize; if (unpack_sha1_header(&stream, map, mapsize, hdr, sizeof(hdr)) < 0) status = error("unable to unpack %s header", sha1_to_hex(sha1)); @@ -2391,13 +2400,15 @@ int sha1_object_info_extended(const unsigned char *sha1, struct object_info *oi) if (co) { if (oi->sizep) *(oi->sizep) = co->size; + if (oi->disk_sizep) + *(oi->disk_sizep) = 0; oi->whence = OI_CACHED; return co->type; } if (!find_pack_entry(sha1, &e)) { /* Most likely it's a loose object. */ - status = sha1_loose_object_info(sha1, oi->sizep); + status = sha1_loose_object_info(sha1, oi->sizep, oi->disk_sizep); if (status >= 0) { oi->whence = OI_LOOSE; return status; @@ -2409,7 +2420,8 @@ int sha1_object_info_extended(const unsigned char *sha1, struct object_info *oi) return status; } - status = packed_object_info(e.p, e.offset, oi->sizep, &rtype); + status = packed_object_info(e.p, e.offset, oi->sizep, &rtype, + oi->disk_sizep); if (status < 0) { mark_bad_packed_object(e.p, sha1); status = sha1_object_info_extended(sha1, oi); @@ -2428,7 +2440,7 @@ int sha1_object_info_extended(const unsigned char *sha1, struct object_info *oi) int sha1_object_info(const unsigned char *sha1, unsigned long *sizep) { - struct object_info oi; + struct object_info oi = {0}; oi.sizep = sizep; return sha1_object_info_extended(sha1, &oi); diff --git a/streaming.c b/streaming.c index cabcd9d157..cac282f06b 100644 --- a/streaming.c +++ b/streaming.c @@ -135,7 +135,7 @@ struct git_istream *open_istream(const unsigned char *sha1, struct stream_filter *filter) { struct git_istream *st; - struct object_info oi; + struct object_info oi = {0}; const unsigned char *real = lookup_replace_object(sha1); enum input_source src = istream_source(real, type, &oi); diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh index 9cc5c6bf4d..d499d02a29 100755 --- a/t/t1006-cat-file.sh +++ b/t/t1006-cat-file.sh @@ -36,66 +36,54 @@ $content" ' test_expect_success "Type of $type is correct" ' - test $type = "$(git cat-file -t $sha1)" + echo $type >expect && + git cat-file -t $sha1 >actual && + test_cmp expect actual ' test_expect_success "Size of $type is correct" ' - test $size = "$(git cat-file -s $sha1)" + echo $size >expect && + git cat-file -s $sha1 >actual && + test_cmp expect actual ' test -z "$content" || test_expect_success "Content of $type is correct" ' - expect="$(maybe_remove_timestamp "$content" $no_ts)" - actual="$(maybe_remove_timestamp "$(git cat-file $type $sha1)" $no_ts)" - - if test "z$expect" = "z$actual" - then - : happy - else - echo "Oops: expected $expect" - echo "but got $actual" - false - fi + maybe_remove_timestamp "$content" $no_ts >expect && + maybe_remove_timestamp "$(git cat-file $type $sha1)" $no_ts >actual && + test_cmp expect actual ' test_expect_success "Pretty content of $type is correct" ' - expect="$(maybe_remove_timestamp "$pretty_content" $no_ts)" - actual="$(maybe_remove_timestamp "$(git cat-file -p $sha1)" $no_ts)" - if test "z$expect" = "z$actual" - then - : happy - else - echo "Oops: expected $expect" - echo "but got $actual" - false - fi + maybe_remove_timestamp "$pretty_content" $no_ts >expect && + maybe_remove_timestamp "$(git cat-file -p $sha1)" $no_ts >actual && + test_cmp expect actual ' test -z "$content" || test_expect_success "--batch output of $type is correct" ' - expect="$(maybe_remove_timestamp "$batch_output" $no_ts)" - actual="$(maybe_remove_timestamp "$(echo $sha1 | git cat-file --batch)" $no_ts)" - if test "z$expect" = "z$actual" - then - : happy - else - echo "Oops: expected $expect" - echo "but got $actual" - false - fi + maybe_remove_timestamp "$batch_output" $no_ts >expect && + maybe_remove_timestamp "$(echo $sha1 | git cat-file --batch)" $no_ts >actual && + test_cmp expect actual ' test_expect_success "--batch-check output of $type is correct" ' - expect="$sha1 $type $size" - actual="$(echo_without_newline $sha1 | git cat-file --batch-check)" - if test "z$expect" = "z$actual" - then - : happy - else - echo "Oops: expected $expect" - echo "but got $actual" - false - fi + echo "$sha1 $type $size" >expect && + echo_without_newline $sha1 | git cat-file --batch-check >actual && + test_cmp expect actual + ' + + test_expect_success "custom --batch-check format" ' + echo "$type $sha1" >expect && + echo $sha1 | git cat-file --batch-check="%(objecttype) %(objectname)" >actual && + test_cmp expect actual + ' + + test_expect_success '--batch-check with %(rest)' ' + echo "$type this is some extra content" >expect && + echo "$sha1 this is some extra content" | + git cat-file --batch-check="%(objecttype) %(rest)" >actual && + test_cmp expect actual ' } |