summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/technical/commit-graph-format.txt30
-rw-r--r--commit-graph.c113
-rw-r--r--commit-graph.h5
3 files changed, 147 insertions, 1 deletions
diff --git a/Documentation/technical/commit-graph-format.txt b/Documentation/technical/commit-graph-format.txt
index a4f17441ae..de56f9f1ef 100644
--- a/Documentation/technical/commit-graph-format.txt
+++ b/Documentation/technical/commit-graph-format.txt
@@ -17,6 +17,9 @@ metadata, including:
- The parents of the commit, stored using positional references within
the graph file.
+- The Bloom filter of the commit carrying the paths that were changed between
+ the commit and its first parent, if requested.
+
These positional references are stored as unsigned 32-bit integers
corresponding to the array position within the list of commit OIDs. Due
to some special constants we use to track parents, we can store at most
@@ -93,6 +96,33 @@ CHUNK DATA:
positions for the parents until reaching a value with the most-significant
bit on. The other bits correspond to the position of the last parent.
+ Bloom Filter Index (ID: {'B', 'I', 'D', 'X'}) (N * 4 bytes) [Optional]
+ * The ith entry, BIDX[i], stores the number of 8-byte word blocks in all
+ Bloom filters from commit 0 to commit i (inclusive) in lexicographic
+ order. The Bloom filter for the i-th commit spans from BIDX[i-1] to
+ BIDX[i] (plus header length), where BIDX[-1] is 0.
+ * The BIDX chunk is ignored if the BDAT chunk is not present.
+
+ Bloom Filter Data (ID: {'B', 'D', 'A', 'T'}) [Optional]
+ * It starts with header consisting of three unsigned 32-bit integers:
+ - Version of the hash algorithm being used. We currently only support
+ value 1 which corresponds to the 32-bit version of the murmur3 hash
+ implemented exactly as described in
+ https://en.wikipedia.org/wiki/MurmurHash#Algorithm and the double
+ hashing technique using seed values 0x293ae76f and 0x7e646e2 as
+ described in https://doi.org/10.1007/978-3-540-30494-4_26 "Bloom Filters
+ in Probabilistic Verification"
+ - The number of times a path is hashed and hence the number of bit positions
+ that cumulatively determine whether a file is present in the commit.
+ - The minimum number of bits 'b' per entry in the Bloom filter. If the filter
+ contains 'n' entries, then the filter size is the minimum number of 64-bit
+ words that contain n*b bits.
+ * The rest of the chunk is the concatenation of all the computed Bloom
+ filters for the commits in lexicographic order.
+ * Note: Commits with no changes or more than 512 changes have Bloom filters
+ of length zero.
+ * The BDAT chunk is present if and only if BIDX is present.
+
Base Graphs List (ID: {'B', 'A', 'S', 'E'}) [Optional]
This list of H-byte hashes describe a set of B commit-graph files that
form a commit-graph chain. The graph position for the ith commit in this
diff --git a/commit-graph.c b/commit-graph.c
index 732c81fa1b..a8b6b5cca5 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -24,8 +24,10 @@
#define GRAPH_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
#define GRAPH_CHUNKID_DATA 0x43444154 /* "CDAT" */
#define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */
+#define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */
+#define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */
#define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */
-#define MAX_NUM_CHUNKS 5
+#define MAX_NUM_CHUNKS 7
#define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16)
@@ -319,6 +321,32 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
chunk_repeated = 1;
else
graph->chunk_base_graphs = data + chunk_offset;
+ break;
+
+ case GRAPH_CHUNKID_BLOOMINDEXES:
+ if (graph->chunk_bloom_indexes)
+ chunk_repeated = 1;
+ else
+ graph->chunk_bloom_indexes = data + chunk_offset;
+ break;
+
+ case GRAPH_CHUNKID_BLOOMDATA:
+ if (graph->chunk_bloom_data)
+ chunk_repeated = 1;
+ else {
+ uint32_t hash_version;
+ graph->chunk_bloom_data = data + chunk_offset;
+ hash_version = get_be32(data + chunk_offset);
+
+ if (hash_version != 1)
+ break;
+
+ graph->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings));
+ graph->bloom_filter_settings->hash_version = hash_version;
+ graph->bloom_filter_settings->num_hashes = get_be32(data + chunk_offset + 4);
+ graph->bloom_filter_settings->bits_per_entry = get_be32(data + chunk_offset + 8);
+ }
+ break;
}
if (chunk_repeated) {
@@ -337,6 +365,15 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
last_chunk_offset = chunk_offset;
}
+ if (graph->chunk_bloom_indexes && graph->chunk_bloom_data) {
+ init_bloom_filters();
+ } else {
+ /* We need both the bloom chunks to exist together. Else ignore the data */
+ graph->chunk_bloom_indexes = NULL;
+ graph->chunk_bloom_data = NULL;
+ graph->bloom_filter_settings = NULL;
+ }
+
hashcpy(graph->oid.hash, graph->data + graph->data_len - graph->hash_len);
if (verify_commit_graph_lite(graph)) {
@@ -1034,6 +1071,59 @@ static void write_graph_chunk_extra_edges(struct hashfile *f,
}
}
+static void write_graph_chunk_bloom_indexes(struct hashfile *f,
+ struct write_commit_graph_context *ctx)
+{
+ struct commit **list = ctx->commits.list;
+ struct commit **last = ctx->commits.list + ctx->commits.nr;
+ uint32_t cur_pos = 0;
+ struct progress *progress = NULL;
+ int i = 0;
+
+ if (ctx->report_progress)
+ progress = start_delayed_progress(
+ _("Writing changed paths Bloom filters index"),
+ ctx->commits.nr);
+
+ while (list < last) {
+ struct bloom_filter *filter = get_bloom_filter(ctx->r, *list);
+ cur_pos += filter->len;
+ display_progress(progress, ++i);
+ hashwrite_be32(f, cur_pos);
+ list++;
+ }
+
+ stop_progress(&progress);
+}
+
+static void write_graph_chunk_bloom_data(struct hashfile *f,
+ struct write_commit_graph_context *ctx,
+ const struct bloom_filter_settings *settings)
+{
+ struct commit **list = ctx->commits.list;
+ struct commit **last = ctx->commits.list + ctx->commits.nr;
+ struct progress *progress = NULL;
+ int i = 0;
+
+ if (ctx->report_progress)
+ progress = start_delayed_progress(
+ _("Writing changed paths Bloom filters data"),
+ ctx->commits.nr);
+
+ hashwrite_be32(f, settings->hash_version);
+ hashwrite_be32(f, settings->num_hashes);
+ hashwrite_be32(f, settings->bits_per_entry);
+
+ while (list < last) {
+ struct bloom_filter *filter = get_bloom_filter(ctx->r, *list);
+ display_progress(progress, ++i);
+ hashwrite(f, filter->data, filter->len * sizeof(unsigned char));
+ list++;
+ }
+
+ stop_progress(&progress);
+}
+
static int oid_compare(const void *_a, const void *_b)
{
const struct object_id *a = (const struct object_id *)_a;
@@ -1438,6 +1528,7 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
struct strbuf progress_title = STRBUF_INIT;
int num_chunks = 3;
struct object_id file_hash;
+ const struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
if (ctx->split) {
struct strbuf tmp_file = STRBUF_INIT;
@@ -1482,6 +1573,12 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
chunk_ids[num_chunks] = GRAPH_CHUNKID_EXTRAEDGES;
num_chunks++;
}
+ if (ctx->changed_paths) {
+ chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMINDEXES;
+ num_chunks++;
+ chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMDATA;
+ num_chunks++;
+ }
if (ctx->num_commit_graphs_after > 1) {
chunk_ids[num_chunks] = GRAPH_CHUNKID_BASE;
num_chunks++;
@@ -1500,6 +1597,15 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
4 * ctx->num_extra_edges;
num_chunks++;
}
+ if (ctx->changed_paths) {
+ chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
+ sizeof(uint32_t) * ctx->commits.nr;
+ num_chunks++;
+
+ chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
+ sizeof(uint32_t) * 3 + ctx->total_bloom_filter_data_size;
+ num_chunks++;
+ }
if (ctx->num_commit_graphs_after > 1) {
chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
hashsz * (ctx->num_commit_graphs_after - 1);
@@ -1537,6 +1643,10 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
write_graph_chunk_data(f, hashsz, ctx);
if (ctx->num_extra_edges)
write_graph_chunk_extra_edges(f, ctx);
+ if (ctx->changed_paths) {
+ write_graph_chunk_bloom_indexes(f, ctx);
+ write_graph_chunk_bloom_data(f, ctx, &bloom_settings);
+ }
if (ctx->num_commit_graphs_after > 1 &&
write_graph_chunk_base(f, ctx)) {
return -1;
@@ -2184,6 +2294,7 @@ void free_commit_graph(struct commit_graph *g)
close(g->graph_fd);
}
free(g->filename);
+ free(g->bloom_filter_settings);
free(g);
}
diff --git a/commit-graph.h b/commit-graph.h
index 86be81219d..8e7a8e0e5b 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -11,6 +11,7 @@
#define GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD "GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD"
struct commit;
+struct bloom_filter_settings;
char *get_commit_graph_filename(struct object_directory *odb);
int open_commit_graph(const char *graph_file, int *fd, struct stat *st);
@@ -59,6 +60,10 @@ struct commit_graph {
const unsigned char *chunk_commit_data;
const unsigned char *chunk_extra_edges;
const unsigned char *chunk_base_graphs;
+ const unsigned char *chunk_bloom_indexes;
+ const unsigned char *chunk_bloom_data;
+
+ struct bloom_filter_settings *bloom_filter_settings;
};
struct commit_graph *load_commit_graph_one_fd_st(int fd, struct stat *st,