aboutsummaryrefslogtreecommitdiffstats
path: root/diffcore-pickaxe.c
blob: 401eb72c619d432c92fc66b470fae778117bcace (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
/*
 * Copyright (C) 2005 Junio C Hamano
 * Copyright (C) 2010 Google Inc.
 */
#include "cache.h"
#include "diff.h"
#include "diffcore.h"
#include "xdiff-interface.h"
#include "kwset.h"

typedef int (*pickaxe_fn)(mmfile_t *one, mmfile_t *two,
			  struct diff_options *o,
			  regex_t *regexp, kwset_t kws);

static int pickaxe_match(struct diff_filepair *p, struct diff_options *o,
			 regex_t *regexp, kwset_t kws, pickaxe_fn fn);

static void pickaxe(struct diff_queue_struct *q, struct diff_options *o,
		    regex_t *regexp, kwset_t kws, pickaxe_fn fn)
{
	int i;
	struct diff_queue_struct outq;

	DIFF_QUEUE_CLEAR(&outq);

	if (o->pickaxe_opts & DIFF_PICKAXE_ALL) {
		/* Showing the whole changeset if needle exists */
		for (i = 0; i < q->nr; i++) {
			struct diff_filepair *p = q->queue[i];
			if (pickaxe_match(p, o, regexp, kws, fn))
				return; /* do not munge the queue */
		}

		/*
		 * Otherwise we will clear the whole queue by copying
		 * the empty outq at the end of this function, but
		 * first clear the current entries in the queue.
		 */
		for (i = 0; i < q->nr; i++)
			diff_free_filepair(q->queue[i]);
	} else {
		/* Showing only the filepairs that has the needle */
		for (i = 0; i < q->nr; i++) {
			struct diff_filepair *p = q->queue[i];
			if (pickaxe_match(p, o, regexp, kws, fn))
				diff_q(&outq, p);
			else
				diff_free_filepair(p);
		}
	}

	free(q->queue);
	*q = outq;
}

struct diffgrep_cb {
	regex_t *regexp;
	int hit;
};

static void diffgrep_consume(void *priv, char *line, unsigned long len)
{
	struct diffgrep_cb *data = priv;
	regmatch_t regmatch;
	int hold;

	if (line[0] != '+' && line[0] != '-')
		return;
	if (data->hit)
		/*
		 * NEEDSWORK: we should have a way to terminate the
		 * caller early.
		 */
		return;
	/* Yuck -- line ought to be "const char *"! */
	hold = line[len];
	line[len] = '\0';
	data->hit = !regexec(data->regexp, line + 1, 1, &regmatch, 0);
	line[len] = hold;
}

static int diff_grep(mmfile_t *one, mmfile_t *two,
		     struct diff_options *o,
		     regex_t *regexp, kwset_t kws)
{
	regmatch_t regmatch;
	struct diffgrep_cb ecbdata;
	xpparam_t xpp;
	xdemitconf_t xecfg;

	if (!one)
		return !regexec(regexp, two->ptr, 1, &regmatch, 0);
	if (!two)
		return !regexec(regexp, one->ptr, 1, &regmatch, 0);

	/*
	 * We have both sides; need to run textual diff and see if
	 * the pattern appears on added/deleted lines.
	 */
	memset(&xpp, 0, sizeof(xpp));
	memset(&xecfg, 0, sizeof(xecfg));
	ecbdata.regexp = regexp;
	ecbdata.hit = 0;
	xecfg.ctxlen = o->context;
	xecfg.interhunkctxlen = o->interhunkcontext;
	xdi_diff_outf(one, two, diffgrep_consume, &ecbdata,
		      &xpp, &xecfg);
	return ecbdata.hit;
}

static void diffcore_pickaxe_grep(struct diff_options *o)
{
	int err;
	regex_t regex;
	int cflags = REG_EXTENDED | REG_NEWLINE;

	if (DIFF_OPT_TST(o, PICKAXE_IGNORE_CASE))
		cflags |= REG_ICASE;

	err = regcomp(&regex, o->pickaxe, cflags);
	if (err) {
		char errbuf[1024];
		regerror(err, &regex, errbuf, 1024);
		regfree(&regex);
		die("invalid regex: %s", errbuf);
	}

	pickaxe(&diff_queued_diff, o, &regex, NULL, diff_grep);

	regfree(&regex);
	return;
}

static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws)
{
	unsigned int cnt;
	unsigned long sz;
	const char *data;

	sz = mf->size;
	data = mf->ptr;
	cnt = 0;

	if (regexp) {
		regmatch_t regmatch;
		int flags = 0;

		assert(data[sz] == '\0');
		while (*data && !regexec(regexp, data, 1, &regmatch, flags)) {
			flags |= REG_NOTBOL;
			data += regmatch.rm_eo;
			if (*data && regmatch.rm_so == regmatch.rm_eo)
				data++;
			cnt++;
		}

	} else { /* Classic exact string match */
		while (sz) {
			struct kwsmatch kwsm;
			size_t offset = kwsexec(kws, data, sz, &kwsm);
			const char *found;
			if (offset == -1)
				break;
			else
				found = data + offset;
			sz -= found - data + kwsm.size[0];
			data = found + kwsm.size[0];
			cnt++;
		}
	}
	return cnt;
}

static int has_changes(mmfile_t *one, mmfile_t *two,
		       struct diff_options *o,
		       regex_t *regexp, kwset_t kws)
{
	unsigned int one_contains = one ? contains(one, regexp, kws) : 0;
	unsigned int two_contains = two ? contains(two, regexp, kws) : 0;
	return one_contains != two_contains;
}

static int pickaxe_match(struct diff_filepair *p, struct diff_options *o,
			 regex_t *regexp, kwset_t kws, pickaxe_fn fn)
{
	struct userdiff_driver *textconv_one = NULL;
	struct userdiff_driver *textconv_two = NULL;
	mmfile_t mf1, mf2;
	int ret;

	if (!o->pickaxe[0])
		return 0;

	/* ignore unmerged */
	if (!DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two))
		return 0;

	if (DIFF_OPT_TST(o, ALLOW_TEXTCONV)) {
		textconv_one = get_textconv(p->one);
		textconv_two = get_textconv(p->two);
	}

	/*
	 * If we have an unmodified pair, we know that the count will be the
	 * same and don't even have to load the blobs. Unless textconv is in
	 * play, _and_ we are using two different textconv filters (e.g.,
	 * because a pair is an exact rename with different textconv attributes
	 * for each side, which might generate different content).
	 */
	if (textconv_one == textconv_two && diff_unmodified_pair(p))
		return 0;

	mf1.size = fill_textconv(textconv_one, p->one, &mf1.ptr);
	mf2.size = fill_textconv(textconv_two, p->two, &mf2.ptr);

	ret = fn(DIFF_FILE_VALID(p->one) ? &mf1 : NULL,
		 DIFF_FILE_VALID(p->two) ? &mf2 : NULL,
		 o, regexp, kws);

	if (textconv_one)
		free(mf1.ptr);
	if (textconv_two)
		free(mf2.ptr);
	diff_free_filespec_data(p->one);
	diff_free_filespec_data(p->two);

	return ret;
}

static void diffcore_pickaxe_count(struct diff_options *o)
{
	const char *needle = o->pickaxe;
	int opts = o->pickaxe_opts;
	unsigned long len = strlen(needle);
	regex_t regex, *regexp = NULL;
	kwset_t kws = NULL;

	if (opts & DIFF_PICKAXE_REGEX) {
		int err;
		err = regcomp(&regex, needle, REG_EXTENDED | REG_NEWLINE);
		if (err) {
			/* The POSIX.2 people are surely sick */
			char errbuf[1024];
			regerror(err, &regex, errbuf, 1024);
			regfree(&regex);
			die("invalid regex: %s", errbuf);
		}
		regexp = &regex;
	} else {
		kws = kwsalloc(DIFF_OPT_TST(o, PICKAXE_IGNORE_CASE)
			       ? tolower_trans_tbl : NULL);
		kwsincr(kws, needle, len);
		kwsprep(kws);
	}

	pickaxe(&diff_queued_diff, o, regexp, kws, has_changes);

	if (opts & DIFF_PICKAXE_REGEX)
		regfree(&regex);
	else
		kwsfree(kws);
	return;
}

void diffcore_pickaxe(struct diff_options *o)
{
	/* Might want to warn when both S and G are on; I don't care... */
	if (o->pickaxe_opts & DIFF_PICKAXE_KIND_G)
		diffcore_pickaxe_grep(o);
	else
		diffcore_pickaxe_count(o);
}
ask() is called. So at that point we know we're trying to prompt the user, and we can just die if ReadLine instantiation fails, rather than making this fake object to lazily delay showing the error. This should be OK even if there is no tty (e.g., we're in a cron job), because Term::ReadLine will return a stub object in that case whose "IN" and "OUT" functions return undef. And since 5906f54e47 (send-email: don't attempt to prompt if tty is closed, 2009-03-31), we check for that case and skip prompting. And we can be sure that FakeTerm was not kicking in for such a situation, because it has actually been broken since that commit! It does not define "IN" or "OUT" methods, so perl would barf with an error. If FakeTerm was in use, we were neither honoring what 5906f54e47 tried to do, nor producing the readable message that 280242d1cc intended. So we're better off just dropping FakeTerm entirely, and letting the error reported by constructing Term::ReadLine through. [jc: cherry-picked from v2.42.0-rc2~6^2~1] Signed-off-by: Jeff King <peff@peff.net> Acked-by: Taylor Blau <me@ttaylorr.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> 2024-04-19Git 2.42.2v2.42.2Johannes Schindelin3-2/+9 Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-19Git 2.41.1v2.41.1Johannes Schindelin3-2/+9 Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-19Git 2.40.2v2.40.2Johannes Schindelin3-2/+9 Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-19Git 2.39.4v2.39.4Johannes Schindelin3-2/+81 Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-19fsck: warn about symlink pointing inside a gitdirJohannes Schindelin4-0/+117 In the wake of fixing a vulnerability where `git clone` mistakenly followed a symbolic link that it had just written while checking out files, writing into a gitdir, let's add some defense-in-depth by teaching `git fsck` to report symbolic links stored in its trees that point inside `.git/`. Even though the Git project never made any promises about the exact shape of the `.git/` directory's contents, there are likely repositories out there containing symbolic links that point inside the gitdir. For that reason, let's only report these as warnings, not as errors. Security-conscious users are encouraged to configure `fsck.symlinkPointsToGitDir = error`. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-19core.hooksPath: add some protection while cloningJohannes Schindelin2-1/+27 Quite frequently, when vulnerabilities were found in Git's (quite complex) clone machinery, a relatively common way to escalate the severity was to trick Git into running a hook which is actually a script that has just been laid on disk as part of that clone. This constitutes a Remote Code Execution vulnerability, the highest severity observed in Git's vulnerabilities so far. Some previously-fixed vulnerabilities allowed malicious repositories to be crafted such that Git would check out files not in the worktree, but in, say, a submodule's `<git>/hooks/` directory. A vulnerability that "merely" allows to modify the Git config would allow a related attack vector, to manipulate Git into looking in the worktree for hooks, e.g. redirecting the location where Git looks for hooks, via setting `core.hooksPath` (which would be classified as CWE-427: Uncontrolled Search Path Element and CWE-114: Process Control, for more details see https://cwe.mitre.org/data/definitions/427.html and https://cwe.mitre.org/data/definitions/114.html). To prevent that attack vector, let's error out and complain loudly if an active `core.hooksPath` configuration is seen in the repository-local Git config during a `git clone`. There is one caveat: This changes Git's behavior in a slightly backwards-incompatible manner. While it is probably a rare scenario (if it exists at all) to configure `core.hooksPath` via a config in the Git templates, it _is_ conceivable that some valid setup requires this to work. In the hopefully very unlikely case that a user runs into this, there is an escape hatch: set the `GIT_CLONE_PROTECTION_ACTIVE=false` environment variable. Obviously, this should be done only with utmost caution. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-19init.templateDir: consider this config setting protectedJohannes Schindelin2-7/+61 The ability to configuring the template directory is a delicate feature: It allows defining hooks that will be run e.g. during a `git clone` operation, such as the `post-checkout` hook. As such, it is of utmost importance that Git would not allow that config setting to be changed during a `git clone` by mistake, allowing an attacker a chance for a Remote Code Execution, allowing attackers to run arbitrary code on unsuspecting users' machines. As a defense-in-depth measure, to prevent minor vulnerabilities in the `git clone` code from ballooning into higher-serverity attack vectors, let's make this a protected setting just like `safe.directory` and friends, i.e. ignore any `init.templateDir` entries from any local config. Note: This does not change the behavior of any recursive clone (modulo bugs), as the local repository config is not even supposed to be written while cloning the superproject, except in one scenario: If a config template is configured that sets the template directory. This might be done because `git clone --recurse-submodules --template=<directory>` does not pass that template directory on to the submodules' initialization. Another scenario where this commit changes behavior is where repositories are _not_ cloned recursively, and then some (intentional, benign) automation configures the template directory to be used before initializing the submodules. So the caveat is that this could theoretically break existing processes. In both scenarios, there is a way out, though: configuring the template directory via the environment variable `GIT_TEMPLATE_DIR`. This change in behavior is a trade-off between security and backwards-compatibility that is struck in favor of security. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-19clone: prevent hooks from running during a cloneJohannes Schindelin3-1/+94 Critical security issues typically combine relatively common vulnerabilities such as case confusion in file paths with other weaknesses in order to raise the severity of the attack. One such weakness that has haunted the Git project in many a submodule-related CVE is that any hooks that are found are executed during a clone operation. Examples are the `post-checkout` and `fsmonitor` hooks. However, Git's design calls for hooks to be disabled by default, as only disabled example hooks are copied over from the templates in `<prefix>/share/git-core/templates/`. As a defense-in-depth measure, let's prevent those hooks from running. Obviously, administrators can choose to drop enabled hooks into the template directory, though, _and_ it is also possible to override `core.hooksPath`, in which case the new check needs to be disabled. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-19Add a helper function to compare file contentsJohannes Schindelin4-0/+123 In the next commit, Git will learn to disallow hooks during `git clone` operations _except_ when those hooks come from the templates (which are inherently supposed to be trusted). To that end, we add a function to compare the contents of two files. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-17init: refactor the template directory discovery into its own functionJohannes Schindelin3-18/+37 We will need to call this function from `hook.c` to be able to prevent hooks from running that were written as part of a `clone` but did not originate from the template directory. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-17find_hook(): refactor the `STRIP_EXTENSION` logicJohannes Schindelin1-7/+11 When looking for a hook and not finding one, and when `STRIP_EXTENSION` is available (read: if we're on Windows and `.exe` is the required extension for executable programs), we want to look also for a hook with that extension. Previously, we added that handling into the conditional block that was meant to handle when no hook was found (possibly providing some advice for the user's benefit). If the hook with that file extension was found, we'd return early from that function instead of writing out said advice, of course. However, we're about to introduce a safety valve to prevent hooks from being run during a clone, to reduce the attack surface of bugs that allow writing files to be written into arbitrary locations. To prepare for that, refactor the logic to avoid the early return, by separating the `STRIP_EXTENSION` handling from the conditional block handling the case when no hook was found. This commit is best viewed with `--patience`. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-17clone: when symbolic links collide with directories, keep the latterJohannes Schindelin3-2/+31 When recursively cloning a repository with submodules, we must ensure that the submodules paths do not suddenly contain symbolic links that would let Git write into unintended locations. We just plugged that vulnerability, but let's add some more defense-in-depth. Since we can only keep one item on disk if multiple index entries' paths collide, we may just as well avoid keeping a symbolic link (because that would allow attack vectors where Git follows those links by mistake). Technically, we handle more situations than cloning submodules into paths that were (partially) replaced by symbolic links. This provides defense-in-depth in case someone finds a case-folding confusion vulnerability in the future that does not even involve submodules. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-17entry: report more colliding pathsJohannes Schindelin3-1/+20 In b878579ae7 (clone: report duplicate entries on case-insensitive filesystems, 2018-08-17) code was added to warn about index entries that resolve to the same file system entity (usually the cause is a case-insensitive filesystem). In Git for Windows, where inodes are not trusted (because of a performance trade-off, inodes are equal to 0 by default), that check does not compare inode numbers but the verbatim path. This logic works well when index entries' paths differ only in case. However, for file/directory conflicts only the file's path was reported, leaving the user puzzled with what that path collides. Let's try ot catch colliding paths even if one path is the prefix of the other. We do this also in setups where the file system is case-sensitive because the inode check would not be able to catch those collisions. While not a complete solution (for example, on macOS, Unicode normalization could also lead to file/directory conflicts but be missed by this logic), it is at least another defensive layer on top of what the previous commits added. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-17t5510: verify that D/F confusion cannot lead to an RCEJohannes Schindelin1-0/+24 The most critical vulnerabilities in Git lead to a Remote Code Execution ("RCE"), i.e. the ability for an attacker to have malicious code being run as part of a Git operation that is not expected to run said code, such has hooks delivered as part of a `git clone`. A couple of parent commits ago, a bug was fixed that let Git be confused by the presence of a path `a-` to mistakenly assume that a directory `a/` can safely be created without removing an existing `a` that is a symbolic link. This bug did not represent an exploitable vulnerability on its own; Let's make sure it stays that way. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-17submodule: require the submodule path to contain directories onlyJohannes Schindelin4-5/+113 Submodules are stored in subdirectories of their superproject. When these subdirectories have been replaced with symlinks by a malicious actor, all kinds of mayhem can be caused. This _should_ not be possible, but many CVEs in the past showed that _when_ possible, it allows attackers to slip in code that gets executed during, say, a `git clone --recursive` operation. Let's add some defense-in-depth to disallow submodule paths to have anything except directories in them. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-17clone_submodule: avoid using `access()` on directoriesJohannes Schindelin1-1/+1 In 0060fd1511b (clone --recurse-submodules: prevent name squatting on Windows, 2019-09-12), I introduced code to verify that a git dir either does not exist, or is at least empty, to fend off attacks where an inadvertently (and likely maliciously) pre-populated git dir would be used while cloning submodules recursively. The logic used `access(<path>, X_OK)` to verify that a directory exists before calling `is_empty_dir()` on it. That is a curious way to check for a directory's existence and might well fail for unwanted reasons. Even the original author (it was I ;-) ) struggles to explain why this function was used rather than `stat()`. This code was _almost_ copypastad in the previous commit, but that `access()` call was caught during review. Let's use `stat()` instead also in the code that was almost copied verbatim. Let's not use `lstat()` because in the unlikely event that somebody snuck a symbolic link in, pointing to a crafted directory, we want to verify that that directory is empty. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-17submodules: submodule paths must not contain symlinksJohannes Schindelin2-0/+83 When creating a submodule path, we must be careful not to follow symbolic links. Otherwise we may follow a symbolic link pointing to a gitdir (which are valid symbolic links!) e.g. while cloning. On case-insensitive filesystems, however, we blindly replace a directory that has been created as part of the `clone` operation with a symlink when the path to the latter differs only in case from the former's path. Let's simply avoid this situation by expecting not ever having to overwrite any existing file/directory/symlink upon cloning. That way, we won't even replace a directory that we just created. This addresses CVE-2024-32002. Reported-by: Filip Hejsek <filip.hejsek@gmail.com> Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> 2024-04-17clone: prevent clashing git dirs when cloning submodule in parallelFilip Hejsek3-2/+66 While it is expected to have several git dirs within the `.git/modules/` tree, it is important that they do not interfere with each other. For example, if one submodule was called "captain" and another submodule "captain/hooks", their respective git dirs would clash, as they would be located in `.git/modules/captain/` and `.git/modules/captain/hooks/`, respectively, i.e. the latter's files could clash with the actual Git hooks of the former. To prevent these clashes, and in particular to prevent hooks from being written and then executed as part of a recursive clone, we introduced checks as part of the fix for CVE-2019-1387 in a8dee3ca61 (Disallow dubiously-nested submodule git directories, 2019-10-01). It is currently possible to bypass the check for clashing submodule git dirs in two ways: 1. parallel cloning 2. checkout --recurse-submodules Let's check not only before, but also after parallel cloning (and before checking out the submodule), that the git dir is not clashing with another one, otherwise fail. This addresses the parallel cloning issue. As to the parallel checkout issue: It requires quite a few manual steps to create clashing git dirs because Git itself would refuse to initialize the inner one, as demonstrated by the test case. Nevertheless, let's teach the recursive checkout (namely, the `submodule_move_head()` function that is used by the recursive checkout) to be careful to verify that it does not use a clashing git dir, and if it does, disable it (by deleting the `HEAD` file so that subsequent Git calls won't recognize it as a git dir anymore). Note: The parallel cloning test case contains a `cat err` that proved to be highly useful when analyzing the racy nature of the operation (the operation can fail with three different error messages, depending on timing), and was left on purpose to ease future debugging should the need arise. Signed-off-by: Filip Hejsek <filip.hejsek@gmail.com> Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>