From 5010cb5fcca30269ad25f2eb38b31455af3205d7 Mon Sep 17 00:00:00 2001 From: Junio C Hamano <junkio@cox.net> Date: Sun, 30 Apr 2006 23:28:15 -0700 Subject: built-in "git grep" This attempts to set up built-in "git grep" to further reduce our dependence on the shell, while at the same time optionally allowing to run grep against object database. You could do funky things like these: git grep --cached -e pattern ;# grep from index git grep -e pattern master ;# or in a rev git grep -e pattern master next ;# or in multiple revs git grep -e pattern pu^@ ;# even like this with an ;# extension from another topic ;-) git grep -e pattern master..next ;# or even from rev ranges git grep -e pattern master~20:Documentation ;# or an arbitrary tree git grep -e pattern next:git-commit.sh ;# or an arbitrary blob Right now, it does not understand and/or obey many options grep should accept, and the pattern must be given with -e option due to the way the parameter parser is structured, both of which obviously need to be fixed for usability. But this is going in the right direction. The shell script version is one of the worst Portability offender in the git barebone Porcelainish; it uses xargs -0 to pass paths around and shell arrays to sift flags and parameters. Signed-off-by: Junio C Hamano <junkio@cox.net> --- Makefile | 2 +- builtin-grep.c | 454 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ builtin.h | 1 + git.c | 1 + 4 files changed, 457 insertions(+), 1 deletion(-) create mode 100644 builtin-grep.c diff --git a/Makefile b/Makefile index 8ce27a65fb..8d5122bdd5 100644 --- a/Makefile +++ b/Makefile @@ -214,7 +214,7 @@ LIB_OBJS = \ $(DIFF_OBJS) BUILTIN_OBJS = \ - builtin-log.o builtin-help.o + builtin-log.o builtin-help.o builtin-grep.o GITLIBS = $(LIB_FILE) $(XDIFF_LIB) LIBS = $(GITLIBS) -lz diff --git a/builtin-grep.c b/builtin-grep.c new file mode 100644 index 0000000000..36150bf4ef --- /dev/null +++ b/builtin-grep.c @@ -0,0 +1,454 @@ +/* + * Builtin "git grep" + * + * Copyright (c) 2006 Junio C Hamano + */ +#include "cache.h" +#include "blob.h" +#include "tree.h" +#include "commit.h" +#include "tag.h" +#include "diff.h" +#include "revision.h" +#include "builtin.h" +#include <regex.h> + +static int pathspec_matches(struct diff_options *opt, const char *name) +{ + int i, j; + int namelen; + if (!opt->nr_paths) + return 1; + namelen = strlen(name); + for (i = 0; i < opt->nr_paths; i++) { + const char *match = opt->paths[i]; + int matchlen = opt->pathlens[i]; + if (matchlen <= namelen) { + if (!strncmp(name, match, matchlen)) + return 1; + continue; + } + /* If name is "Documentation" and pathspec is + * "Documentation/", they should match. Maybe + * we would want to strip it in get_pathspec()??? + */ + if (strncmp(name, match, namelen)) + continue; + for (j = namelen; j < matchlen; j++) + if (match[j] != '/') + break; + if (matchlen <= j) + return 1; + } + return 0; +} + +struct grep_opt { + const char *pattern; + regex_t regexp; + unsigned linenum:1; + unsigned invert:1; + int regflags; + unsigned pre_context; + unsigned post_context; +}; + +static char *end_of_line(char *cp, unsigned long *left) +{ + unsigned long l = *left; + while (l && *cp != '\n') { + l--; + cp++; + } + *left = l; + return cp; +} + +static void show_line(struct grep_opt *opt, const char *bol, const char *eol, + const char *name, unsigned lno, char sign) +{ + printf("%s%c", name, sign); + if (opt->linenum) + printf("%d%c", lno, sign); + printf("%.*s\n", eol-bol, bol); +} + +static int grep_buffer(struct grep_opt *opt, const char *name, + char *buf, unsigned long size) +{ + char *bol = buf; + unsigned long left = size; + unsigned lno = 1; + struct pre_context_line { + char *bol; + char *eol; + } *prev = NULL, *pcl; + unsigned last_hit = 0; + unsigned last_shown = 0; + const char *hunk_mark = ""; + + if (opt->pre_context) + prev = xcalloc(opt->pre_context, sizeof(*prev)); + if (opt->pre_context || opt->post_context) + hunk_mark = "--\n"; + + while (left) { + regmatch_t pmatch[10]; + char *eol, ch; + int hit; + + eol = end_of_line(bol, &left); + ch = *eol; + *eol = 0; + + hit = !regexec(&opt->regexp, bol, ARRAY_SIZE(pmatch), + pmatch, 0); + if (opt->invert) + hit = !hit; + if (hit) { + /* Hit at this line. If we haven't shown the + * pre-context lines, we would need to show them. + */ + if (opt->pre_context) { + unsigned from; + if (opt->pre_context < lno) + from = lno - opt->pre_context; + else + from = 1; + if (from <= last_shown) + from = last_shown + 1; + if (last_shown && from != last_shown + 1) + printf(hunk_mark); + while (from < lno) { + pcl = &prev[lno-from-1]; + show_line(opt, pcl->bol, pcl->eol, + name, from, '-'); + from++; + } + last_shown = lno-1; + } + if (last_shown && lno != last_shown + 1) + printf(hunk_mark); + show_line(opt, bol, eol, name, lno, ':'); + last_shown = last_hit = lno; + } + else if (last_hit && + lno <= last_hit + opt->post_context) { + /* If the last hit is within the post context, + * we need to show this line. + */ + if (last_shown && lno != last_shown + 1) + printf(hunk_mark); + show_line(opt, bol, eol, name, lno, '-'); + last_shown = lno; + } + if (opt->pre_context) { + memmove(prev+1, prev, + (opt->pre_context-1) * sizeof(*prev)); + prev->bol = bol; + prev->eol = eol; + } + *eol = ch; + bol = eol + 1; + left--; + lno++; + } + return !!last_hit; +} + +static int grep_sha1(struct grep_opt *opt, const unsigned char *sha1, const char *name) +{ + unsigned long size; + char *data; + char type[20]; + int hit; + data = read_sha1_file(sha1, type, &size); + if (!data) { + error("'%s': unable to read %s", name, sha1_to_hex(sha1)); + return 0; + } + hit = grep_buffer(opt, name, data, size); + free(data); + return hit; +} + +static int grep_file(struct grep_opt *opt, const char *filename) +{ + struct stat st; + int i; + char *data; + if (lstat(filename, &st) < 0) { + err_ret: + if (errno != ENOENT) + error("'%s': %s", filename, strerror(errno)); + return 0; + } + if (!st.st_size) + return 0; /* empty file -- no grep hit */ + if (!S_ISREG(st.st_mode)) + return 0; + i = open(filename, O_RDONLY); + if (i < 0) + goto err_ret; + data = xmalloc(st.st_size + 1); + if (st.st_size != xread(i, data, st.st_size)) { + error("'%s': short read %s", filename, strerror(errno)); + close(i); + free(data); + return 0; + } + close(i); + i = grep_buffer(opt, filename, data, st.st_size); + free(data); + return i; +} + +static int grep_cache(struct grep_opt *opt, struct rev_info *revs, int cached) +{ + int hit = 0; + int nr; + read_cache(); + + for (nr = 0; nr < active_nr; nr++) { + struct cache_entry *ce = active_cache[nr]; + if (ce_stage(ce) || !S_ISREG(ntohl(ce->ce_mode))) + continue; + if (!pathspec_matches(&revs->diffopt, ce->name)) + continue; + if (cached) + hit |= grep_sha1(opt, ce->sha1, ce->name); + else + hit |= grep_file(opt, ce->name); + } + return hit; +} + +static int grep_tree(struct grep_opt *opt, struct rev_info *revs, + struct tree_desc *tree, + const char *tree_name, const char *base) +{ + unsigned mode; + int len; + int hit = 0; + const char *path; + const unsigned char *sha1; + char *down_base; + char *path_buf = xmalloc(PATH_MAX + strlen(tree_name) + 100); + + if (tree_name[0]) { + int offset = sprintf(path_buf, "%s:", tree_name); + down_base = path_buf + offset; + strcat(down_base, base); + } + else { + down_base = path_buf; + strcpy(down_base, base); + } + len = strlen(path_buf); + + while (tree->size) { + int pathlen; + sha1 = tree_entry_extract(tree, &path, &mode); + pathlen = strlen(path); + strcpy(path_buf + len, path); + + if (!pathspec_matches(&revs->diffopt, down_base)) + ; + else if (S_ISREG(mode)) + hit |= grep_sha1(opt, sha1, path_buf); + else if (S_ISDIR(mode)) { + char type[20]; + struct tree_desc sub; + void *data; + data = read_sha1_file(sha1, type, &sub.size); + if (!data) + die("unable to read tree (%s)", + sha1_to_hex(sha1)); + strcpy(path_buf + len + pathlen, "/"); + sub.buf = data; + hit = grep_tree(opt, revs, &sub, tree_name, down_base); + free(data); + } + update_tree_entry(tree); + } + return hit; +} + +static int grep_object(struct grep_opt *opt, struct rev_info *revs, + struct object *obj, const char *name) +{ + if (!strcmp(obj->type, blob_type)) + return grep_sha1(opt, obj->sha1, name); + if (!strcmp(obj->type, commit_type) || + !strcmp(obj->type, tree_type)) { + struct tree_desc tree; + void *data; + int hit; + data = read_object_with_reference(obj->sha1, tree_type, + &tree.size, NULL); + if (!data) + die("unable to read tree (%s)", sha1_to_hex(obj->sha1)); + tree.buf = data; + hit = grep_tree(opt, revs, &tree, name, ""); + free(data); + return hit; + } + die("unable to grep from object of type %s", obj->type); +} + +static const char builtin_grep_usage[] = +"git-grep <option>* <rev>* [-e] <pattern> [<path>...]"; + +int cmd_grep(int argc, const char **argv, char **envp) +{ + struct rev_info rev; + const char **dst, **src; + int err; + int hit = 0; + int no_more_arg = 0; + int seen_range = 0; + int seen_noncommit = 0; + int cached = 0; + struct grep_opt opt; + struct object_list *list; + + memset(&opt, 0, sizeof(opt)); + opt.regflags = REG_NEWLINE; + + /* + * Interpret and remove the grep options upfront. Sigh... + */ + for (dst = src = &argv[1]; src < argc + argv; ) { + const char *arg = *src++; + if (!no_more_arg) { + if (!strcmp("--", arg)) { + no_more_arg = 1; + *dst++ = arg; + continue; + } + if (!strcmp("--cached", arg)) { + cached = 1; + continue; + } + if (!strcmp("-i", arg) || + !strcmp("--ignore-case", arg)) { + opt.regflags |= REG_ICASE; + continue; + } + if (!strcmp("-v", arg) || + !strcmp("--invert-match", arg)) { + opt.invert = 1; + continue; + } + if (!strcmp("-E", arg) || + !strcmp("--extended-regexp", arg)) { + opt.regflags |= REG_EXTENDED; + continue; + } + if (!strcmp("-G", arg) || + !strcmp("--basic-regexp", arg)) { + opt.regflags &= ~REG_EXTENDED; + continue; + } + if (!strcmp("-e", arg)) { + if (src < argc + argv) { + opt.pattern = *src++; + continue; + } + usage(builtin_grep_usage); + } + if (!strcmp("-n", arg)) { + opt.linenum = 1; + continue; + } + if (!strcmp("-H", arg)) { + /* We always show the pathname, so this + * is a noop. + */ + continue; + } + if (!strcmp("-A", arg) || + !strcmp("-B", arg) || + !strcmp("-C", arg)) { + unsigned num; + if ((argc + argv <= src) || + sscanf(*src++, "%u", &num) != 1) + usage(builtin_grep_usage); + switch (arg[1]) { + case 'A': + opt.post_context = num; + break; + case 'C': + opt.post_context = num; + case 'B': + opt.pre_context = num; + break; + } + continue; + } + } + *dst++ = arg; + } + if (!opt.pattern) + die("no pattern given."); + + err = regcomp(&opt.regexp, opt.pattern, opt.regflags); + if (err) { + char errbuf[1024]; + regerror(err, &opt.regexp, errbuf, 1024); + regfree(&opt.regexp); + die("'%s': %s", opt.pattern, errbuf); + } + + init_revisions(&rev); + *dst = NULL; + argc = setup_revisions(dst - argv, argv, &rev, NULL); + + /* + * Do not walk "grep -e foo master next pu -- Documentation/" + * but do walk "grep -e foo master..next -- Documentation/". + * Ranged request mixed with a blob or tree object, like + * "grep -e foo v1.0.0:Documentation/ master..next" + * so detect that and complain. + */ + for (list = rev.pending_objects; list; list = list->next) { + struct object *real_obj; + if (list->item->flags & UNINTERESTING) + seen_range = 1; + real_obj = deref_tag(list->item, NULL, 0); + if (strcmp(real_obj->type, commit_type)) + seen_noncommit = 1; + } + if (!rev.pending_objects) + return !grep_cache(&opt, &rev, cached); + if (cached) + die("both --cached and revisions given."); + + if (seen_range && seen_noncommit) + die("both A..B and non commit are given."); + if (seen_range) { + struct commit *commit; + prepare_revision_walk(&rev); + while ((commit = get_revision(&rev)) != NULL) { + unsigned char *sha1 = commit->object.sha1; + const char *n = find_unique_abbrev(sha1, rev.abbrev); + char rev_name[41]; + strcpy(rev_name, n); + if (grep_object(&opt, &rev, &commit->object, rev_name)) + hit = 1; + commit->buffer = NULL; + } + return !hit; + } + + /* all of them are non-commit; do not walk, and + * do not lose their names. + */ + for (list = rev.pending_objects; list; list = list->next) { + struct object *real_obj; + real_obj = deref_tag(list->item, NULL, 0); + if (grep_object(&opt, &rev, real_obj, list->name)) + hit = 1; + } + return !hit; +} diff --git a/builtin.h b/builtin.h index 47408a0585..cf5de3b931 100644 --- a/builtin.h +++ b/builtin.h @@ -19,5 +19,6 @@ extern int cmd_version(int argc, const char **argv, char **envp); extern int cmd_whatchanged(int argc, const char **argv, char **envp); extern int cmd_show(int argc, const char **argv, char **envp); extern int cmd_log(int argc, const char **argv, char **envp); +extern int cmd_grep(int argc, const char **argv, char **envp); #endif diff --git a/git.c b/git.c index 01b7e28b8c..893bddd768 100644 --- a/git.c +++ b/git.c @@ -46,6 +46,7 @@ static void handle_internal_command(int argc, const char **argv, char **envp) { "log", cmd_log }, { "whatchanged", cmd_whatchanged }, { "show", cmd_show }, + { "grep", cmd_grep }, }; int i; -- cgit v1.2.3