summaryrefslogtreecommitdiff
path: root/commit-graph.c
diff options
context:
space:
mode:
authorLibravatar Junio C Hamano <gitster@pobox.com>2020-05-01 13:39:53 -0700
committerLibravatar Junio C Hamano <gitster@pobox.com>2020-05-01 13:39:53 -0700
commit9b6606f43d55bbf33b9924d16e02e60e1c09660a (patch)
tree0082094df8d1e99873fa1e72628fa4e02ec3a282 /commit-graph.c
parentMerge branch 'tb/commit-graph-fd-exhaustion-fix' (diff)
parentbloom: ignore renames when computing changed paths (diff)
downloadtgif-9b6606f43d55bbf33b9924d16e02e60e1c09660a.tar.xz
Merge branch 'gs/commit-graph-path-filter'
Introduce an extension to the commit-graph to make it efficient to check for the paths that were modified at each commit using Bloom filters. * gs/commit-graph-path-filter: bloom: ignore renames when computing changed paths commit-graph: add GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS test flag t4216: add end to end tests for git log with Bloom filters revision.c: add trace2 stats around Bloom filter usage revision.c: use Bloom filters to speed up path based revision walks commit-graph: add --changed-paths option to write subcommand commit-graph: reuse existing Bloom filters during write commit-graph: write Bloom filters to commit graph file commit-graph: examine commits by generation number commit-graph: examine changed-path objects in pack order commit-graph: compute Bloom filters for changed paths diff: halt tree-diff early after max_changes bloom.c: core Bloom filter implementation for changed paths. bloom.c: introduce core Bloom filter constructs bloom.c: add the murmur3 hash implementation commit-graph: define and use MAX_NUM_CHUNKS
Diffstat (limited to 'commit-graph.c')
-rw-r--r--commit-graph.c213
1 files changed, 207 insertions, 6 deletions
diff --git a/commit-graph.c b/commit-graph.c
index e9b3c5d561..7eb4f22f00 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -16,13 +16,18 @@
#include "hashmap.h"
#include "replace-object.h"
#include "progress.h"
+#include "bloom.h"
+#include "commit-slab.h"
#define GRAPH_SIGNATURE 0x43475048 /* "CGPH" */
#define GRAPH_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
#define GRAPH_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
#define GRAPH_CHUNKID_DATA 0x43444154 /* "CDAT" */
#define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */
+#define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */
+#define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */
#define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */
+#define MAX_NUM_CHUNKS 7
#define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16)
@@ -44,9 +49,51 @@
/* Remember to update object flag allocation in object.h */
#define REACHABLE (1u<<15)
-char *get_commit_graph_filename(struct object_directory *odb)
+/* Keep track of the order in which commits are added to our list. */
+define_commit_slab(commit_pos, int);
+static struct commit_pos commit_pos = COMMIT_SLAB_INIT(1, commit_pos);
+
+static void set_commit_pos(struct repository *r, const struct object_id *oid)
+{
+ static int32_t max_pos;
+ struct commit *commit = lookup_commit(r, oid);
+
+ if (!commit)
+ return; /* should never happen, but be lenient */
+
+ *commit_pos_at(&commit_pos, commit) = max_pos++;
+}
+
+static int commit_pos_cmp(const void *va, const void *vb)
+{
+ const struct commit *a = *(const struct commit **)va;
+ const struct commit *b = *(const struct commit **)vb;
+ return commit_pos_at(&commit_pos, a) -
+ commit_pos_at(&commit_pos, b);
+}
+
+static int commit_gen_cmp(const void *va, const void *vb)
+{
+ const struct commit *a = *(const struct commit **)va;
+ const struct commit *b = *(const struct commit **)vb;
+
+ /* lower generation commits first */
+ if (a->generation < b->generation)
+ return -1;
+ else if (a->generation > b->generation)
+ return 1;
+
+ /* use date as a heuristic when generations are equal */
+ if (a->date < b->date)
+ return -1;
+ else if (a->date > b->date)
+ return 1;
+ return 0;
+}
+
+char *get_commit_graph_filename(struct object_directory *obj_dir)
{
- return xstrfmt("%s/info/commit-graph", odb->path);
+ return xstrfmt("%s/info/commit-graph", obj_dir->path);
}
static char *get_split_graph_filename(struct object_directory *odb,
@@ -270,6 +317,32 @@ struct commit_graph *parse_commit_graph(void *graph_map, size_t graph_size)
chunk_repeated = 1;
else
graph->chunk_base_graphs = data + chunk_offset;
+ break;
+
+ case GRAPH_CHUNKID_BLOOMINDEXES:
+ if (graph->chunk_bloom_indexes)
+ chunk_repeated = 1;
+ else
+ graph->chunk_bloom_indexes = data + chunk_offset;
+ break;
+
+ case GRAPH_CHUNKID_BLOOMDATA:
+ if (graph->chunk_bloom_data)
+ chunk_repeated = 1;
+ else {
+ uint32_t hash_version;
+ graph->chunk_bloom_data = data + chunk_offset;
+ hash_version = get_be32(data + chunk_offset);
+
+ if (hash_version != 1)
+ break;
+
+ graph->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings));
+ graph->bloom_filter_settings->hash_version = hash_version;
+ graph->bloom_filter_settings->num_hashes = get_be32(data + chunk_offset + 4);
+ graph->bloom_filter_settings->bits_per_entry = get_be32(data + chunk_offset + 8);
+ }
+ break;
}
if (chunk_repeated) {
@@ -288,6 +361,15 @@ struct commit_graph *parse_commit_graph(void *graph_map, size_t graph_size)
last_chunk_offset = chunk_offset;
}
+ if (graph->chunk_bloom_indexes && graph->chunk_bloom_data) {
+ init_bloom_filters();
+ } else {
+ /* We need both the bloom chunks to exist together. Else ignore the data */
+ graph->chunk_bloom_indexes = NULL;
+ graph->chunk_bloom_data = NULL;
+ graph->bloom_filter_settings = NULL;
+ }
+
hashcpy(graph->oid.hash, graph->data + graph->data_len - graph->hash_len);
if (verify_commit_graph_lite(graph)) {
@@ -784,9 +866,12 @@ struct write_commit_graph_context {
unsigned append:1,
report_progress:1,
split:1,
- check_oids:1;
+ check_oids:1,
+ changed_paths:1,
+ order_by_pack:1;
const struct split_commit_graph_opts *split_opts;
+ size_t total_bloom_filter_data_size;
};
static void write_graph_chunk_fanout(struct hashfile *f,
@@ -982,6 +1067,59 @@ static void write_graph_chunk_extra_edges(struct hashfile *f,
}
}
+static void write_graph_chunk_bloom_indexes(struct hashfile *f,
+ struct write_commit_graph_context *ctx)
+{
+ struct commit **list = ctx->commits.list;
+ struct commit **last = ctx->commits.list + ctx->commits.nr;
+ uint32_t cur_pos = 0;
+ struct progress *progress = NULL;
+ int i = 0;
+
+ if (ctx->report_progress)
+ progress = start_delayed_progress(
+ _("Writing changed paths Bloom filters index"),
+ ctx->commits.nr);
+
+ while (list < last) {
+ struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0);
+ cur_pos += filter->len;
+ display_progress(progress, ++i);
+ hashwrite_be32(f, cur_pos);
+ list++;
+ }
+
+ stop_progress(&progress);
+}
+
+static void write_graph_chunk_bloom_data(struct hashfile *f,
+ struct write_commit_graph_context *ctx,
+ const struct bloom_filter_settings *settings)
+{
+ struct commit **list = ctx->commits.list;
+ struct commit **last = ctx->commits.list + ctx->commits.nr;
+ struct progress *progress = NULL;
+ int i = 0;
+
+ if (ctx->report_progress)
+ progress = start_delayed_progress(
+ _("Writing changed paths Bloom filters data"),
+ ctx->commits.nr);
+
+ hashwrite_be32(f, settings->hash_version);
+ hashwrite_be32(f, settings->num_hashes);
+ hashwrite_be32(f, settings->bits_per_entry);
+
+ while (list < last) {
+ struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0);
+ display_progress(progress, ++i);
+ hashwrite(f, filter->data, filter->len * sizeof(unsigned char));
+ list++;
+ }
+
+ stop_progress(&progress);
+}
+
static int oid_compare(const void *_a, const void *_b)
{
const struct object_id *a = (const struct object_id *)_a;
@@ -1013,6 +1151,8 @@ static int add_packed_commits(const struct object_id *oid,
oidcpy(&(ctx->oids.list[ctx->oids.nr]), oid);
ctx->oids.nr++;
+ set_commit_pos(ctx->r, oid);
+
return 0;
}
@@ -1132,6 +1272,38 @@ static void compute_generation_numbers(struct write_commit_graph_context *ctx)
stop_progress(&ctx->progress);
}
+static void compute_bloom_filters(struct write_commit_graph_context *ctx)
+{
+ int i;
+ struct progress *progress = NULL;
+ struct commit **sorted_commits;
+
+ init_bloom_filters();
+
+ if (ctx->report_progress)
+ progress = start_delayed_progress(
+ _("Computing commit changed paths Bloom filters"),
+ ctx->commits.nr);
+
+ ALLOC_ARRAY(sorted_commits, ctx->commits.nr);
+ COPY_ARRAY(sorted_commits, ctx->commits.list, ctx->commits.nr);
+
+ if (ctx->order_by_pack)
+ QSORT(sorted_commits, ctx->commits.nr, commit_pos_cmp);
+ else
+ QSORT(sorted_commits, ctx->commits.nr, commit_gen_cmp);
+
+ for (i = 0; i < ctx->commits.nr; i++) {
+ struct commit *c = sorted_commits[i];
+ struct bloom_filter *filter = get_bloom_filter(ctx->r, c, 1);
+ ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len;
+ display_progress(progress, i + 1);
+ }
+
+ free(sorted_commits);
+ stop_progress(&progress);
+}
+
static int add_ref_to_set(const char *refname,
const struct object_id *oid,
int flags, void *cb_data)
@@ -1361,12 +1533,13 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
int fd;
struct hashfile *f;
struct lock_file lk = LOCK_INIT;
- uint32_t chunk_ids[6];
- uint64_t chunk_offsets[6];
+ uint32_t chunk_ids[MAX_NUM_CHUNKS + 1];
+ uint64_t chunk_offsets[MAX_NUM_CHUNKS + 1];
const unsigned hashsz = the_hash_algo->rawsz;
struct strbuf progress_title = STRBUF_INIT;
int num_chunks = 3;
struct object_id file_hash;
+ const struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
if (ctx->split) {
struct strbuf tmp_file = STRBUF_INIT;
@@ -1411,6 +1584,12 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
chunk_ids[num_chunks] = GRAPH_CHUNKID_EXTRAEDGES;
num_chunks++;
}
+ if (ctx->changed_paths) {
+ chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMINDEXES;
+ num_chunks++;
+ chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMDATA;
+ num_chunks++;
+ }
if (ctx->num_commit_graphs_after > 1) {
chunk_ids[num_chunks] = GRAPH_CHUNKID_BASE;
num_chunks++;
@@ -1429,6 +1608,15 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
4 * ctx->num_extra_edges;
num_chunks++;
}
+ if (ctx->changed_paths) {
+ chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
+ sizeof(uint32_t) * ctx->commits.nr;
+ num_chunks++;
+
+ chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
+ sizeof(uint32_t) * 3 + ctx->total_bloom_filter_data_size;
+ num_chunks++;
+ }
if (ctx->num_commit_graphs_after > 1) {
chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
hashsz * (ctx->num_commit_graphs_after - 1);
@@ -1466,6 +1654,10 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
write_graph_chunk_data(f, hashsz, ctx);
if (ctx->num_extra_edges)
write_graph_chunk_extra_edges(f, ctx);
+ if (ctx->changed_paths) {
+ write_graph_chunk_bloom_indexes(f, ctx);
+ write_graph_chunk_bloom_data(f, ctx, &bloom_settings);
+ }
if (ctx->num_commit_graphs_after > 1 &&
write_graph_chunk_base(f, ctx)) {
return -1;
@@ -1804,6 +1996,8 @@ int write_commit_graph(struct object_directory *odb,
ctx->split = flags & COMMIT_GRAPH_WRITE_SPLIT ? 1 : 0;
ctx->check_oids = flags & COMMIT_GRAPH_WRITE_CHECK_OIDS ? 1 : 0;
ctx->split_opts = split_opts;
+ ctx->changed_paths = flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS ? 1 : 0;
+ ctx->total_bloom_filter_data_size = 0;
if (ctx->split) {
struct commit_graph *g;
@@ -1856,6 +2050,7 @@ int write_commit_graph(struct object_directory *odb,
}
if (pack_indexes) {
+ ctx->order_by_pack = 1;
if ((res = fill_oids_from_packs(ctx, pack_indexes)))
goto cleanup;
}
@@ -1865,8 +2060,10 @@ int write_commit_graph(struct object_directory *odb,
goto cleanup;
}
- if (!pack_indexes && !commits)
+ if (!pack_indexes && !commits) {
+ ctx->order_by_pack = 1;
fill_oids_from_all_packs(ctx);
+ }
close_reachable(ctx);
@@ -1902,6 +2099,9 @@ int write_commit_graph(struct object_directory *odb,
compute_generation_numbers(ctx);
+ if (ctx->changed_paths)
+ compute_bloom_filters(ctx);
+
res = write_commit_graph_file(ctx);
if (ctx->split)
@@ -2126,6 +2326,7 @@ void free_commit_graph(struct commit_graph *g)
g->data = NULL;
}
free(g->filename);
+ free(g->bloom_filter_settings);
free(g);
}