diff options
Diffstat (limited to 'Documentation/technical')
| -rw-r--r-- | Documentation/technical/.gitignore | 1 | ||||
| -rw-r--r-- | Documentation/technical/api-error-handling.adoc (renamed from Documentation/technical/api-error-handling.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/api-index-skel.adoc (renamed from Documentation/technical/api-index-skel.txt) | 0 | ||||
| -rwxr-xr-x | Documentation/technical/api-index.sh | 27 | ||||
| -rw-r--r-- | Documentation/technical/api-merge.adoc (renamed from Documentation/technical/api-merge.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/api-parse-options.adoc (renamed from Documentation/technical/api-parse-options.txt) | 10 | ||||
| -rw-r--r-- | Documentation/technical/api-path-walk.adoc | 81 | ||||
| -rw-r--r-- | Documentation/technical/api-simple-ipc.adoc (renamed from Documentation/technical/api-simple-ipc.txt) | 2 | ||||
| -rw-r--r-- | Documentation/technical/api-trace2.adoc (renamed from Documentation/technical/api-trace2.txt) | 2 | ||||
| -rw-r--r-- | Documentation/technical/bitmap-format.adoc (renamed from Documentation/technical/bitmap-format.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/build-systems.adoc | 227 | ||||
| -rw-r--r-- | Documentation/technical/bundle-uri.adoc (renamed from Documentation/technical/bundle-uri.txt) | 14 | ||||
| -rw-r--r-- | Documentation/technical/commit-graph.adoc (renamed from Documentation/technical/commit-graph.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/directory-rename-detection.adoc (renamed from Documentation/technical/directory-rename-detection.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/hash-function-transition.adoc (renamed from Documentation/technical/hash-function-transition.txt) | 6 | ||||
| -rw-r--r-- | Documentation/technical/large-object-promisors.adoc | 656 | ||||
| -rw-r--r-- | Documentation/technical/long-running-process-protocol.adoc (renamed from Documentation/technical/long-running-process-protocol.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/meson.build | 67 | ||||
| -rw-r--r-- | Documentation/technical/multi-pack-index.adoc (renamed from Documentation/technical/multi-pack-index.txt) | 82 | ||||
| -rw-r--r-- | Documentation/technical/pack-heuristics.adoc (renamed from Documentation/technical/pack-heuristics.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/packfile-uri.adoc (renamed from Documentation/technical/packfile-uri.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/parallel-checkout.adoc (renamed from Documentation/technical/parallel-checkout.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/partial-clone.adoc (renamed from Documentation/technical/partial-clone.txt) | 4 | ||||
| -rw-r--r-- | Documentation/technical/platform-support.adoc (renamed from Documentation/technical/platform-support.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/racy-git.adoc (renamed from Documentation/technical/racy-git.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/reftable.adoc (renamed from Documentation/technical/reftable.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/remembering-renames.adoc (renamed from Documentation/technical/remembering-renames.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/repository-version.adoc (renamed from Documentation/technical/repository-version.txt) | 44 | ||||
| -rw-r--r-- | Documentation/technical/rerere.adoc (renamed from Documentation/technical/rerere.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/scalar.adoc (renamed from Documentation/technical/scalar.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/send-pack-pipeline.adoc (renamed from Documentation/technical/send-pack-pipeline.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/shallow.adoc (renamed from Documentation/technical/shallow.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/sparse-checkout.adoc (renamed from Documentation/technical/sparse-checkout.txt) | 6 | ||||
| -rw-r--r-- | Documentation/technical/sparse-index.adoc (renamed from Documentation/technical/sparse-index.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/trivial-merge.adoc (renamed from Documentation/technical/trivial-merge.txt) | 0 | ||||
| -rw-r--r-- | Documentation/technical/unit-tests.adoc (renamed from Documentation/technical/unit-tests.txt) | 0 |
36 files changed, 1148 insertions, 81 deletions
diff --git a/Documentation/technical/.gitignore b/Documentation/technical/.gitignore index 8aa891daee..3caef14a93 100644 --- a/Documentation/technical/.gitignore +++ b/Documentation/technical/.gitignore @@ -1 +1,2 @@ api-index.txt +api-index.adoc diff --git a/Documentation/technical/api-error-handling.txt b/Documentation/technical/api-error-handling.adoc index 665c4960b4..665c4960b4 100644 --- a/Documentation/technical/api-error-handling.txt +++ b/Documentation/technical/api-error-handling.adoc diff --git a/Documentation/technical/api-index-skel.txt b/Documentation/technical/api-index-skel.adoc index 7780a76b08..7780a76b08 100644 --- a/Documentation/technical/api-index-skel.txt +++ b/Documentation/technical/api-index-skel.adoc diff --git a/Documentation/technical/api-index.sh b/Documentation/technical/api-index.sh index 9c3f4131b8..dd206b1ca4 100755 --- a/Documentation/technical/api-index.sh +++ b/Documentation/technical/api-index.sh @@ -1,28 +1,39 @@ #!/bin/sh +if test $# -ne 2 +then + echo >&2 "USAGE: $0 <SOURCE_DIR> <OUTPUT>" + exit 1 +fi + +SOURCE_DIR="$1" +OUTPUT="$2" + ( + cd "$SOURCE_DIR" + c=//////////////////////////////////////////////////////////////// - skel=api-index-skel.txt + skel=api-index-skel.adoc sed -e '/^\/\/ table of contents begin/q' "$skel" echo "$c" - ls api-*.txt | + ls api-*.adoc | while read filename do case "$filename" in - api-index-skel.txt | api-index.txt) continue ;; + api-index-skel.adoc | api-index.adoc) continue ;; esac title=$(sed -e 1q "$filename") - html=${filename%.txt}.html + html=${filename%.adoc}.html echo "* link:$html[$title]" done echo "$c" sed -n -e '/^\/\/ table of contents end/,$p' "$skel" -) >api-index.txt+ +) >"$OUTPUT"+ -if test -f api-index.txt && cmp api-index.txt api-index.txt+ >/dev/null +if test -f "$OUTPUT" && cmp "$OUTPUT" "$OUTPUT"+ >/dev/null then - rm -f api-index.txt+ + rm -f "$OUTPUT"+ else - mv api-index.txt+ api-index.txt + mv "$OUTPUT"+ "$OUTPUT" fi diff --git a/Documentation/technical/api-merge.txt b/Documentation/technical/api-merge.adoc index c2ba01828c..c2ba01828c 100644 --- a/Documentation/technical/api-merge.txt +++ b/Documentation/technical/api-merge.adoc diff --git a/Documentation/technical/api-parse-options.txt b/Documentation/technical/api-parse-options.adoc index 61fa6ee167..880eb94642 100644 --- a/Documentation/technical/api-parse-options.txt +++ b/Documentation/technical/api-parse-options.adoc @@ -211,11 +211,13 @@ There are some macros to easily define options: Use of `--no-option` will clear the list of preceding values. `OPT_INTEGER(short, long, &int_var, description)`:: - Introduce an option with integer argument. - The integer is put into `int_var`. + Introduce an option with integer argument. The argument must be a + integer and may include a suffix of 'k', 'm' or 'g' to + scale the provided value by 1024, 1024^2 or 1024^3 respectively. + The scaled value is put into `int_var`. -`OPT_MAGNITUDE(short, long, &unsigned_long_var, description)`:: - Introduce an option with a size argument. The argument must be a +`OPT_UNSIGNED(short, long, &unsigned_long_var, description)`:: + Introduce an option with an unsigned integer argument. The argument must be a non-negative integer and may include a suffix of 'k', 'm' or 'g' to scale the provided value by 1024, 1024^2 or 1024^3 respectively. The scaled value is put into `unsigned_long_var`. diff --git a/Documentation/technical/api-path-walk.adoc b/Documentation/technical/api-path-walk.adoc new file mode 100644 index 0000000000..34c905eb9c --- /dev/null +++ b/Documentation/technical/api-path-walk.adoc @@ -0,0 +1,81 @@ +Path-Walk API +============= + +The path-walk API is used to walk reachable objects, but to visit objects +in batches based on a common path they appear in, or by type. + +For example, all reachable commits are visited in a group. All tags are +visited in a group. Then, all root trees are visited. At some point, all +blobs reachable via a path `my/dir/to/A` are visited. When there are +multiple paths possible to reach the same object, then only one of those +paths is used to visit the object. + +Basics +------ + +To use the path-walk API, include `path-walk.h` and call +`walk_objects_by_path()` with a customized `path_walk_info` struct. The +struct is used to set all of the options for how the walk should proceed. +Let's dig into the different options and their use. + +`path_fn` and `path_fn_data`:: + The most important option is the `path_fn` option, which is a + function pointer to the callback that can execute logic on the + object IDs for objects grouped by type and path. This function + also receives a `data` value that corresponds to the + `path_fn_data` member, for providing custom data structures to + this callback function. + +`revs`:: + To configure the exact details of the reachable set of objects, + use the `revs` member and initialize it using the revision + machinery in `revision.h`. Initialize `revs` using calls such as + `setup_revisions()` or `parse_revision_opt()`. Do not call + `prepare_revision_walk()`, as that will be called within + `walk_objects_by_path()`. ++ +It is also important that you do not specify the `--objects` flag for the +`revs` struct. The revision walk should only be used to walk commits, and +the objects will be walked in a separate way based on those starting +commits. + +`commits`, `blobs`, `trees`, `tags`:: + By default, these members are enabled and signal that the path-walk + API should call the `path_fn` on objects of these types. Specialized + applications could disable some options to make it simpler to walk + the objects or to have fewer calls to `path_fn`. ++ +While it is possible to walk only commits in this way, consumers would be +better off using the revision walk API instead. + +`prune_all_uninteresting`:: + By default, all reachable paths are emitted by the path-walk API. + This option allows consumers to declare that they are not + interested in paths where all included objects are marked with the + `UNINTERESTING` flag. This requires using the `boundary` option in + the revision walk so that the walk emits commits marked with the + `UNINTERESTING` flag. + +`edge_aggressive`:: + For performance reasons, usually only the boundary commits are + explored to find UNINTERESTING objects. However, in the case of + shallow clones it can be helpful to mark all trees and blobs + reachable from UNINTERESTING tip commits as UNINTERESTING. This + matches the behavior of `--objects-edge-aggressive` in the + revision API. + +`pl`:: + This pattern list pointer allows focusing the path-walk search to + a set of patterns, only emitting paths that match the given + patterns. See linkgit:gitignore[5] or + linkgit:git-sparse-checkout[1] for details about pattern lists. + When the pattern list uses cone-mode patterns, then the path-walk + API can prune the set of paths it walks to improve performance. + +Examples +-------- + +See example usages in: + `t/helper/test-path-walk.c`, + `builtin/pack-objects.c`, + `builtin/backfill.c` diff --git a/Documentation/technical/api-simple-ipc.txt b/Documentation/technical/api-simple-ipc.adoc index c4fb152b23..972178b042 100644 --- a/Documentation/technical/api-simple-ipc.txt +++ b/Documentation/technical/api-simple-ipc.adoc @@ -36,7 +36,7 @@ Comparison with sub-process model --------------------------------- The Simple-IPC mechanism differs from the existing `sub-process.c` -model (Documentation/technical/long-running-process-protocol.txt) and +model (Documentation/technical/long-running-process-protocol.adoc) and used by applications like Git-LFS. In the LFS-style sub-process model, the helper is started by the foreground process, communication happens via a pair of file descriptors bound to the stdin/stdout of the diff --git a/Documentation/technical/api-trace2.txt b/Documentation/technical/api-trace2.adoc index 5817b18310..cf493dae03 100644 --- a/Documentation/technical/api-trace2.txt +++ b/Documentation/technical/api-trace2.adoc @@ -140,7 +140,7 @@ $ cat ~/log.event To enable a target, set the corresponding environment variable or system or global config value to one of the following: -include::../trace2-target-values.txt[] +include::../trace2-target-values.adoc[] When trace files are written to a target directory, they will be named according to the last component of the SID (optionally followed by a counter to avoid diff --git a/Documentation/technical/bitmap-format.txt b/Documentation/technical/bitmap-format.adoc index bfb0ec7beb..bfb0ec7beb 100644 --- a/Documentation/technical/bitmap-format.txt +++ b/Documentation/technical/bitmap-format.adoc diff --git a/Documentation/technical/build-systems.adoc b/Documentation/technical/build-systems.adoc new file mode 100644 index 0000000000..3c5237b9fd --- /dev/null +++ b/Documentation/technical/build-systems.adoc @@ -0,0 +1,227 @@ += Build Systems + +The build system is the primary way for both developers and system integrators +to interact with the Git project. As such, being easy to use and extend for +those who are not directly developing Git itself is just as important as other +requirements we have on any potential build system. + +This document outlines the different requirements that we have for the build +system and then compares available build systems using these criteria. + +== Requirements + +The following subsections present a list of requirements that we have for any +potential build system. Sections are sorted by decreasing priority. + +=== Platform support + +The build system must have support for all of our platforms that we continually +test against as outlined by our platform support policy. These platforms are: + + - Linux + - Windows + - macOS + +Furthermore, the build system should have support for the following platforms +that generally have somebody running test pipelines against regularly: + + - AIX + - FreeBSD + - NetBSD + - NonStop + - OpenBSD + +The platforms which must be supported by the tool should be aligned with our +platform support policy (see platform-support.adoc). +// once we lose AsciiDoc compatibility, we can start writing the above as: +// xref:platform-support.adoc#platform-support-policy[platform support policy] +// or something like that, but until then.... + +=== Auto-detection of supported features + +The build system must support auto-detection of features which are or aren't +available on the current platform. Platform maintainers should not be required +to manually configure the complete build. + +Auto-detection of the following items is considered to be important: + + - Check for the existence of headers. + - Check for the existence of libraries. + - Check for the existence of exectuables. + - Check for the runtime behavior of specific functions. + - Check for specific link order requirements when multiple libraries are + involved. + +=== Ease of use + +The build system should be both easy to use and easy to extend. While this is +naturally a subjective metric it is likely not controversial to say that some +build systems are considerably harder to use than others. + +=== IDE support + +The build system should integrate with well-known IDEs. Well-known IDEs include: + + - Microsoft Visual Studio + - Visual Studio Code + - Xcode + +There are four levels of support: + + - Native integration into the IDE. + - Integration into the IDE via a plugin. + - Integration into the IDE via generating a project description with the build + system. + - No integration. + +Native integration is preferable, but integration via either a plugin or by +generating a project description via the build system are considered feasible +alternatives. + +Another important distinction is the level of integration. There are two +features that one generally wants to have: + + - Integration of build targets. + - Automatic setup of features like code completion with detected build + dependencies. + +The first bullet point is the bare minimum, but is not sufficient to be +considered proper integration. + +=== Out-of-tree builds + +The build system should support out-of-tree builds. Out-of-tree builds allow a +developer to configure multiple different build directories with different +configuration, e.g. one "debug" build and one "release" build. + +=== Cross-platform builds + +The build system should support cross-platform builds, e.g. building for arm on +an x86-64 host. + +=== Language support + +The following languages and toolchains are of relevance and should be supported +by the build system: + + - C: the primary compiled language used by Git, must be supported. Relevant + toolchains are GCC, Clang and MSVC. + - Rust: candidate as a second compiled lanugage, should be supported. Relevant + toolchains is the LLVM-based rustc. + +Built-in support for the respective languages is preferred over support that +needs to be wired up manually to avoid unnecessary complexity. Native support +includes the following features: + + - Compiling objects. + - Dependency tracking. + - Detection of available features. + - Discovery of relevant toolchains. + - Linking libraries and executables. + - Templating placeholders in scripts. + +=== Test integration + +It should be possible to integrate tests into the build system such that it is +possible to build and test Git within the build system. Features which are nice +to have: + + - Track build-time dependencies for respective tests. Unit tests have + different requirements than integration tests. + - Allow filtering of which tests to run. + - Allow running tests such that utilities like `test_pause` or `debug` work. + +== Comparison + +The following list of build systems are considered: + +- GNU Make +- autoconf +- CMake +- Meson + +=== GNU Make + +- Platform support: ubitquitous on all platforms, but not well-integrated into Windows. +- Auto-detection: no built-in support for auto-detection of features. +- Ease of use: easy to use, but discovering available options is hard. Makefile + rules can quickly get out of hand once reaching a certain scope. +- IDE support: execution of Makefile targets is supported by many IDEs +- Out-of-tree builds: supported in theory, not wired up in practice. +- Cross-platform builds: supported in theory, not wired up in practice. +- Language support: + - C: Limited built-in support, many parts need to be wired up manually. + - Rust: No built-in support, needs to be wired up manually. +- Test integration: partially supported, many parts need to be wired up + manually. + +=== autoconf + +- Platform support: ubiquitous on all platforms, but not well-integrated into Windows. +- Auto-detection: supported. +- Ease of use: easy to use, discovering available options is comparatively + easy. The autoconf syntax is prohibitively hard to extend though due to its + complex set of interacting files and the hard-to-understand M4 language. +- IDE support: no integration into IDEs at generation time. The generated + Makefiles have the same level of support as GNU Make. +- Out-of-tree builds: supported in theory, not wired up in practice. +- Cross-platform builds: supported. +- Language support: + - C: Limited built-in support, many parts need to be wired up manually. + - Rust: No built-in support, needs to be wired up manually. +- Test integration: partially supported, many parts need to be wired up + manually. + +=== CMake + +- Platform support: not as extensive as GNU Make or autoconf, but all major + platforms are supported. + - AIX + - Cygwin + - FreeBSD + - Linux + - OpenBSD + - Solaris + - Windows + - macOS +- Ease of use: easy to use, discovering available options is not always + trivial. The scripting language used by CMake is somewhat cumbersome to use, + but extending CMake build instructions is doable. +- IDE support: natively integrated into Microsoft Visual Studio. Can generate + project descriptions for Xcode. An extension is available for Visual Studio + Code. Many other IDEs have plugins for CMake. +- Out-of-tree builds: supported. +- Cross-platform builds: supported. +- Language support: + - C: Supported for GCC, Clang, MSVC and other toolchains. + - Rust: No built-in support, needs to be wired up manually. +- Test integration: supported, even though test dependencies are a bit + cumbersome to use via "test fixtures". Interactive test runs are not + supported. + +=== Meson + +- Platform: not as extensive as GNU Make or autoconf, but all major platforms + and some smaller ones are supported. + - AIX + - Cygwin + - DragonflyBSD + - FreeBSD + - Haiku + - Linux + - NetBSD + - OpenBSD + - Solaris + - Windows + - macOS +- Ease of use: easy to use, discovering available options is easy. The + scripting language is straight-forward to use. +- IDE support: Supports generating build instructions for Xcode and Microsoft + Visual Studio, a plugin exists for Visual Studio Code. +- Out-of-tree builds: supported. +- Cross-platform builds: supported. +- Language support: + - C: Supported for GCC, Clang, MSVC and other toolchains. + - Rust: Supported for rustc. +- Test integration: supported. Interactive tests are supported starting with + Meson 1.5.0 via the `--interactive` flag. diff --git a/Documentation/technical/bundle-uri.txt b/Documentation/technical/bundle-uri.adoc index 91d3a13e32..12283fa9ed 100644 --- a/Documentation/technical/bundle-uri.txt +++ b/Documentation/technical/bundle-uri.adoc @@ -232,13 +232,13 @@ will interact with bundle URIs according to the following flow: are present in the client repository. If some are missing, then the client delays unbundling until other bundles have been unbundled, making those OIDs present. When all required OIDs are present, the - client unbundles that data using a refspec. The default refspec is - `+refs/heads/*:refs/bundles/*`, but this can be configured. These refs - are stored so that later `git fetch` negotiations can communicate each - bundled ref as a `have`, reducing the size of the fetch over the Git - protocol. To allow pruning refs from this ref namespace, Git may - introduce a numbered namespace (such as `refs/bundles/<i>/*`) such that - stale bundle refs can be deleted. + client unbundles that data using a refspec. The refspec used is + `+refs/*:refs/bundles/*`. These refs are stored so that later + `git fetch` negotiations can communicate each bundled ref as a `have`, + reducing the size of the fetch over the Git protocol. To allow pruning + refs from this ref namespace, Git may introduce a numbered namespace + (such as `refs/bundles/<i>/*`) such that stale bundle refs can be + deleted. 3. If the file is instead a bundle list, then the client inspects the `bundle.mode` to see if the list is of the `all` or `any` form. diff --git a/Documentation/technical/commit-graph.txt b/Documentation/technical/commit-graph.adoc index 2c26e95e51..2c26e95e51 100644 --- a/Documentation/technical/commit-graph.txt +++ b/Documentation/technical/commit-graph.adoc diff --git a/Documentation/technical/directory-rename-detection.txt b/Documentation/technical/directory-rename-detection.adoc index 029ee2cedc..029ee2cedc 100644 --- a/Documentation/technical/directory-rename-detection.txt +++ b/Documentation/technical/directory-rename-detection.adoc diff --git a/Documentation/technical/hash-function-transition.txt b/Documentation/technical/hash-function-transition.adoc index ed57481089..f047fd80ca 100644 --- a/Documentation/technical/hash-function-transition.txt +++ b/Documentation/technical/hash-function-transition.adoc @@ -148,8 +148,8 @@ Detailed Design Repository format extension ~~~~~~~~~~~~~~~~~~~~~~~~~~~ A SHA-256 repository uses repository format version `1` (see -Documentation/technical/repository-version.txt) with extensions -`objectFormat` and `compatObjectFormat`: +linkgit:gitrepository-layout[5]) with `extensions.objectFormat` and +`extensions.compatObjectFormat` (see linkgit:git-config[1]) set to: [core] repositoryFormatVersion = 1 @@ -394,7 +394,7 @@ inflated again in step 3, for a total of two inflations. Step 4 is probably necessary for good read-time performance. "git pack-objects" on the server optimizes the pack file for good data -locality (see Documentation/technical/pack-heuristics.txt). +locality (see Documentation/technical/pack-heuristics.adoc). Details of this process are likely to change. It will take some experimenting to get this to perform well. diff --git a/Documentation/technical/large-object-promisors.adoc b/Documentation/technical/large-object-promisors.adoc new file mode 100644 index 0000000000..dea8dafa66 --- /dev/null +++ b/Documentation/technical/large-object-promisors.adoc @@ -0,0 +1,656 @@ +Large Object Promisors +====================== + +Since Git has been created, users have been complaining about issues +with storing large files in Git. Some solutions have been created to +help, but they haven't helped much with some issues. + +Git currently supports multiple promisor remotes, which could help +with some of these remaining issues, but it's very hard to use them to +help, because a number of important features are missing. + +The goal of the effort described in this document is to add these +important features. + +We will call a "Large Object Promisor", or "LOP" in short, a promisor +remote which is used to store only large blobs and which is separate +from the main remote that should store the other Git objects and the +rest of the repos. + +By extension, we will also call "Large Object Promisor", or LOP, the +effort described in this document to add a set of features to make it +easier to handle large blobs/files in Git by using LOPs. + +This effort aims to especially improve things on the server side, and +especially for large blobs that are already compressed in a binary +format. + +This effort aims to provide an alternative to Git LFS +(https://git-lfs.com/) and similar tools like git-annex +(https://git-annex.branchable.com/) for handling large files, even +though a complete alternative would very likely require other efforts +especially on the client side, where it would likely help to implement +a new object representation for large blobs as discussed in: + +https://lore.kernel.org/git/xmqqbkdometi.fsf@gitster.g/ + +0) Non goals +------------ + +- We will not discuss those client side improvements here, as they + would require changes in different parts of Git than this effort. ++ +So we don't pretend to fully replace Git LFS with only this effort, +but we nevertheless believe that it can significantly improve the +current situation on the server side, and that other separate +efforts could also improve the situation on the client side. + +- In the same way, we are not going to discuss all the possible ways + to implement a LOP or their underlying object storage, or to + optimize how LOP works. ++ +Our opinion is that the simplest solution for now is for LOPs to use +object storage through a remote helper (see section II.2 below for +more details) to store their objects. So we consider that this is the +default implementation. If there are improvements on top of this, +that's great, but our opinion is that such improvements are not +necessary for LOPs to already be useful. Such improvements are likely +a different technical topic, and can be taken care of separately +anyway. ++ +So in particular we are not going to discuss pluggable ODBs or other +object database backends that could chunk large blobs, dedup the +chunks and store them efficiently. Sure, that would be a nice +improvement to store large blobs on the server side, but we believe +it can just be a separate effort as it's also not technically very +related to this effort. ++ +We are also not going to discuss data transfer improvements between +LOPs and clients or servers. Sure, there might be some easy and very +effective optimizations there (as we know that objects on LOPs are +very likely incompressible and not deltifying well), but this can be +dealt with separately in a separate effort. + +In other words, the goal of this document is not to talk about all the +possible ways to optimize how Git could handle large blobs, but to +describe how a LOP based solution can already work well and alleviate +a number of current issues in the context of Git clients and servers +sharing Git objects. + +Even if LOPs are used not very efficiently, they can still be useful +and worth using in some cases, as we will see in more details +later in this document: + + - they can make it simpler for clients to use promisor remotes and + therefore avoid fetching a lot of large blobs they might not need + locally, + + - they can make it significantly cheaper or easier for servers to + host a significant part of the current repository content, and + even more to host content with larger blobs or more large blobs + than currently. + +I) Issues with the current situation +------------------------------------ + +- Some statistics made on GitLab repos have shown that more than 75% + of the disk space is used by blobs that are larger than 1MB and + often in a binary format. + +- So even if users could use Git LFS or similar tools to store a lot + of large blobs out of their repos, it's a fact that in practice they + don't do it as much as they probably should. + +- On the server side ideally, the server should be able to decide for + itself how it stores things. It should not depend on users deciding + to use tools like Git LFS on some blobs or not. + +- It's much more expensive to store large blobs that don't delta + compress well on regular fast seeking drives (like SSDs) than on + object storage (like Amazon S3 or GCP Buckets). Using fast drives + for regular Git repos makes sense though, as serving regular Git + content (blobs containing text or code) needs drives where seeking + is fast, but the content is relatively small. On the other hand, + object storage for Git LFS blobs makes sense as seeking speed is not + as important when dealing with large files, while costs are more + important. So the fact that users don't use Git LFS or similar tools + for a significant number of large blobs has likely some bad + consequences on the cost of repo storage for most Git hosting + platforms. + +- Having large blobs handled in the same way as other blobs and Git + objects in Git repos instead of on object storage also has a cost in + increased memory and CPU usage, and therefore decreased performance, + when creating packfiles. (This is because Git tries to use delta + compression or zlib compression which is unlikely to work well on + already compressed binary content.) So it's not just a storage cost + increase. + +- When a large blob has been committed into a repo, it might not be + possible to remove this blob from the repo without rewriting + history, even if the user then decides to use Git LFS or a similar + tool to handle it. + +- In fact Git LFS and similar tools are not very flexible in letting + users change their minds about the blobs they should handle or not. + +- Even when users are using Git LFS or similar tools, they are often + complaining that these tools require significant effort to set up, + learn and use correctly. + +II) Main features of the "Large Object Promisors" solution +---------------------------------------------------------- + +The main features below should give a rough overview of how the +solution may work. Details about needed elements can be found in +following sections. + +Even if each feature below is very useful for the full solution, it is +very likely to be also useful on its own in some cases where the full +solution is not required. However, we'll focus primarily on the big +picture here. + +Also each feature doesn't need to be implemented entirely in Git +itself. Some could be scripts, hooks or helpers that are not part of +the Git repo. It would be helpful if those could be shared and +improved on collaboratively though. So we want to encourage sharing +them. + +1) Large blobs are stored on LOPs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Large blobs should be stored on special promisor remotes that we will +call "Large Object Promisors" or LOPs. These LOPs should be additional +remotes dedicated to contain large blobs especially those in binary +format. They should be used along with main remotes that contain the +other objects. + +Note 1 +++++++ + +To clarify, a LOP is a normal promisor remote, except that: + +- it should store only large blobs, + +- it should be separate from the main remote, so that the main remote + can focus on serving other objects and the rest of the repos (see + feature 4) below) and can use the LOP as a promisor remote for + itself. + +Note 2 +++++++ + +Git already makes it possible for a main remote to also be a promisor +remote storing both regular objects and large blobs for a client that +clones from it with a filter on blob size. But here we explicitly want +to avoid that. + +Rationale ++++++++++ + +LOPs aim to be good at handling large blobs while main remotes are +already good at handling other objects. + +Implementation +++++++++++++++ + +Git already has support for multiple promisor remotes, see +link:partial-clone.html#using-many-promisor-remotes[the partial clone documentation]. + +Also, Git already has support for partial clone using a filter on the +size of the blobs (with `git clone --filter=blob:limit=<size>`). Most +of the other main features below are based on these existing features +and are about making them easy and efficient to use for the purpose of +better handling large blobs. + +2) LOPs can use object storage +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +LOPs can be implemented using object storage, like an Amazon S3 or GCP +Bucket or MinIO (which is open source under the GNU AGPLv3 license) to +actually store the large blobs, and can be accessed through a Git +remote helper (see linkgit:gitremote-helpers[7]) which makes the +underlying object storage appear like a remote to Git. + +Note +++++ + +A LOP can be a promisor remote accessed using a remote helper by +both some clients and the main remote. + +Rationale ++++++++++ + +This looks like the simplest way to create LOPs that can cheaply +handle many large blobs. + +Implementation +++++++++++++++ + +Remote helpers are quite easy to write as shell scripts, but it might +be more efficient and maintainable to write them using other languages +like Go. + +Some already exist under open source licenses, for example: + + - https://github.com/awslabs/git-remote-s3 + - https://gitlab.com/eric.p.ju/git-remote-gs + +Other ways to implement LOPs are certainly possible, but the goal of +this document is not to discuss how to best implement a LOP or its +underlying object storage (see the "0) Non goals" section above). + +3) LOP object storage can be Git LFS storage +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The underlying object storage that a LOP uses could also serve as +storage for large files handled by Git LFS. + +Rationale ++++++++++ + +This would simplify the server side if it wants to both use a LOP and +act as a Git LFS server. + +4) A main remote can offload to a LOP with a configurable threshold +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +On the server side, a main remote should have a way to offload to a +LOP all its blobs with a size over a configurable threshold. + +Rationale ++++++++++ + +This makes it easy to set things up and to clean things up. For +example, an admin could use this to manually convert a repo not using +LOPs to a repo using a LOP. On a repo already using a LOP but where +some users would sometimes push large blobs, a cron job could use this +to regularly make sure the large blobs are moved to the LOP. + +Implementation +++++++++++++++ + +Using something based on `git repack --filter=...` to separate the +blobs we want to offload from the other Git objects could be a good +idea. The missing part is to connect to the LOP, check if the blobs we +want to offload are already there and if not send them. + +5) A main remote should try to remain clean from large blobs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A main remote should try to avoid containing a lot of oversize +blobs. For that purpose, it should offload as needed to a LOP and it +should have ways to prevent oversize blobs to be fetched, and also +perhaps pushed, into it. + +Rationale ++++++++++ + +A main remote containing many oversize blobs would defeat the purpose +of LOPs. + +Implementation +++++++++++++++ + +The way to offload to a LOP discussed in 4) above can be used to +regularly offload oversize blobs. About preventing oversize blobs from +being fetched into the repo see 6) below. About preventing oversize +blob pushes, a pre-receive hook could be used. + +Also there are different scenarios in which large blobs could get +fetched into the main remote, for example: + +- A client that doesn't implement the "promisor-remote" protocol + (described in 6) below) clones from the main remote. + +- The main remote gets a request for information about a large blob + and is not able to get that information without fetching the blob + from the LOP. + +It might not be possible to completely prevent all these scenarios +from happening. So the goal here should be to implement features that +make the fetching of large blobs less likely. For example adding a +`remote-object-info` command in the `git cat-file --batch` protocol +and its variants might make it possible for a main repo to respond to +some requests about large blobs without fetching them. + +6) A protocol negotiation should happen when a client clones +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When a client clones from a main repo, there should be a protocol +negotiation so that the server can advertise one or more LOPs and so +that the client and the server can discuss if the client could +directly use a LOP the server is advertising. If the client and the +server can agree on that, then the client would be able to get the +large blobs directly from the LOP and the server would not need to +fetch those blobs from the LOP to be able to serve the client. + +Note +++++ + +For fetches instead of clones, a protocol negotiation might not always +happen, see the "What about fetches?" FAQ entry below for details. + +Rationale ++++++++++ + +Security, configurability and efficiency of setting things up. + +Implementation +++++++++++++++ + +A "promisor-remote" protocol v2 capability looks like a good way to +implement this. The way the client and server use this capability +could be controlled by configuration variables. + +Information that the server could send to the client through that +protocol could be things like: LOP name, LOP URL, filter-spec (for +example `blob:limit=<size>`) or just size limit that should be used as +a filter when cloning, token to be used with the LOP, etc. + +7) A client can offload to a LOP +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When a client is using a LOP that is also a LOP of its main remote, +the client should be able to offload some large blobs it has fetched, +but might not need anymore, to the LOP. + +Note +++++ + +It might depend on the context if it should be OK or not for clients +to offload large blobs they have created, instead of fetched, directly +to the LOP without the main remote checking them in some ways +(possibly using hooks or other tools). + +This should be discussed and refined when we get closer to +implementing this feature. + +Rationale ++++++++++ + +On the client, the easiest way to deal with unneeded large blobs is to +offload them. + +Implementation +++++++++++++++ + +This is very similar to what 4) above is about, except on the client +side instead of the server side. So a good solution to 4) could likely +be adapted to work on the client side too. + +There might be some security issues here, as there is no negotiation, +but they might be mitigated if the client can reuse a token it got +when cloning (see 6) above). Also if the large blobs were fetched from +a LOP, it is likely, and can easily be confirmed, that the LOP still +has them, so that they can just be removed from the client. + +III) Benefits of using LOPs +--------------------------- + +Many benefits are related to the issues discussed in "I) Issues with +the current situation" above: + +- No need to rewrite history when deciding which blobs are worth + handling separately than other objects, or when moving or removing + the threshold. + +- If the protocol between client and server is developed and secured + enough, then many details might be setup on the server side only and + all the clients could then easily get all the configuration + information and use it to set themselves up mostly automatically. + +- Storage costs benefits on the server side. + +- Reduced memory and CPU needs on main remotes on the server side. + +- Reduced storage needs on the client side. + +IV) FAQ +------- + +What about using multiple LOPs on the server and client side? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +That could perhaps be useful in some cases, but for now it's more +likely that in most cases a single LOP will be advertised by the +server and should be used by the client. + +A case where it could be useful for a server to advertise multiple +LOPs is if a LOP is better for some users while a different LOP is +better for other users. For example some clients might have a better +connection to a LOP than others. + +In those cases it's the responsibility of the server to have some +documentation to help clients. It could say for example something like +"Users in this part of the world might want to pick only LOP A as it +is likely to be better connected to them, while users in other parts +of the world should pick only LOP B for the same reason." + +When should we trust or not trust the LOPs advertised by the server? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In some contexts, like in corporate setup where the server and all the +clients are parts of an internal network in a company where admins +have all the rights on every system, it's OK, and perhaps even a good +thing, if the clients fully trust the server, as it can help ensure +that all the clients are on the same page. + +There are also contexts in which clients trust a code hosting platform +serving them some repos, but might not fully trust other users +managing or contributing to some of these repos. For example, the code +hosting platform could have hooks in place to check that any object it +receives doesn't contain malware or otherwise bad content. In this +case it might be OK for the client to use a main remote and its LOP if +they are both hosted by the code hosting platform, but not if the LOP +is hosted elsewhere (where the content is not checked). + +In other contexts, a client should just not trust a server. + +So there should be different ways to configure how the client should +behave when a server advertises a LOP to it at clone time. + +As the basic elements that a server can advertise about a LOP are a +LOP name and a LOP URL, the client should base its decision about +accepting a LOP on these elements. + +One simple way to be very strict in the LOP it accepts is for example +for the client to check that the LOP is already configured on the +client with the same name and URL as what the server advertises. + +In general default and "safe" settings should require that the LOP are +configured on the client separately from the "promisor-remote" +protocol and that the client accepts a LOP only when information about +it from the protocol matches what has been already configured +separately. + +What about LOP names? +~~~~~~~~~~~~~~~~~~~~~ + +In some contexts, for example if the clients sometimes fetch from each +other, it can be a good idea for all the clients to use the same names +for all the remotes they use, including LOPs. + +In other contexts, each client might want to be able to give the name +it wants to each remote, including each LOP, it interacts with. + +So there should be different ways to configure how the client accepts +or not the LOP name the server advertises. + +If a default or "safe" setting is used, then as such a setting should +require that the LOP be configured separately, then the name would be +configured separately and there is no risk that the server could +dictate a name to a client. + +Could the main remote be bogged down by old or paranoid clients? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Yes, it could happen if there are too many clients that are either +unwilling to trust the main remote or that just don't implement the +"promisor-remote" protocol because they are too old or not fully +compatible with the 'git' client. + +When serving such a client, the main remote has no other choice than +to first fetch from its LOP, to then be able to provide to the client +everything it requested. So the main remote, even if it has cleanup +mechanisms (see section II.4 above), would be burdened at least +temporarily with the large blobs it had to fetch from its LOP. + +Not behaving like this would be breaking backward compatibility, and +could be seen as segregating clients. For example, it might be +possible to implement a special mode that allows the server to just +reject clients that don't implement the "promisor-remote" protocol or +aren't willing to trust the main remote. This mode might be useful in +a special context like a corporate environment. There is no plan to +implement such a mode though, and this should be discussed separately +later anyway. + +A better way to proceed is probably for the main remote to show a +message telling clients that don't implement the protocol or are +unwilling to accept the advertised LOP(s) that they would get faster +clone and fetches by upgrading client software or properly setting +them up to accept LOP(s). + +Waiting for clients to upgrade, monitoring these upgrades and limiting +the use of LOPs to repos that are not very frequently accessed might +be other good ways to make sure that some benefits are still reaped +from LOPs. Over time, as more and more clients upgrade and benefit +from LOPs, using them in more and more frequently accessed repos will +become worth it. + +Corporate environments, where it might be easier to make sure that all +the clients are up-to-date and properly configured, could hopefully +benefit more and earlier from using LOPs. + +What about fetches? +~~~~~~~~~~~~~~~~~~~ + +There are different kinds of fetches. A regular fetch happens when +some refs have been updated on the server and the client wants the ref +updates and possibly the new objects added with them. A "backfill" or +"lazy" fetch, on the contrary, happens when the client needs to use +some objects it already knows about but doesn't have because they are +on a promisor remote. + +Regular fetch ++++++++++++++ + +In a regular fetch, the client will contact the main remote and a +protocol negotiation will happen between them. It's a good thing that +a protocol negotiation happens every time, as the configuration on the +client or the main remote could have changed since the previous +protocol negotiation. In this case, the new protocol negotiation +should ensure that the new fetch will happen in a way that satisfies +the new configuration of both the client and the server. + +In most cases though, the configurations on the client and the main +remote will not have changed between 2 fetches or between the initial +clone and a subsequent fetch. This means that the result of a new +protocol negotiation will be the same as the previous result, so the +new fetch will happen in the same way as the previous clone or fetch, +using, or not using, the same LOP(s) as last time. + +"Backfill" or "lazy" fetch +++++++++++++++++++++++++++ + +When there is a backfill fetch, the client doesn't necessarily contact +the main remote first. It will try to fetch from its promisor remotes +in the order they appear in the config file, except that a remote +configured using the `extensions.partialClone` config variable will be +tried last. See +link:partial-clone.html#using-many-promisor-remotes[the partial clone documentation]. + +This is not new with this effort. In fact this is how multiple remotes +have already been working for around 5 years. + +When using LOPs, having the main remote configured using +`extensions.partialClone`, so it's tried last, makes sense, as missing +objects should only be large blobs that are on LOPs. + +This means that a protocol negotiation will likely not happen as the +missing objects will be fetched from the LOPs, and then there will be +nothing left to fetch from the main remote. + +To secure that, it could be a good idea for LOPs to require a token +from the client when it fetches from them. The client could get the +token when performing a protocol negotiation with the main remote (see +section II.6 above). + +V) Future improvements +---------------------- + +It is expected that at the beginning using LOPs will be mostly worth +it either in a corporate context where the Git version that clients +use can easily be controlled, or on repos that are infrequently +accessed. (See the "Could the main remote be bogged down by old or +paranoid clients?" section in the FAQ above.) + +Over time, as more and more clients upgrade to a version that +implements the "promisor-remote" protocol v2 capability described +above in section II.6), it will be worth it to use LOPs more widely. + +A lot of improvements may also help using LOPs more widely. Some of +these improvements are part of the scope of this document like the +following: + + - Implementing a "remote-object-info" command in the + `git cat-file --batch` protocol and its variants to allow main + remotes to respond to requests about large blobs without fetching + them. (Eric Ju has started working on this based on previous work + by Calvin Wan.) + + - Creating better cleanup and offload mechanisms for main remotes + and clients to prevent accumulation of large blobs. + + - Developing more sophisticated protocol negotiation capabilities + between clients and servers for handling LOPs, for example adding + a filter-spec (e.g., blob:limit=<size>) or size limit for + filtering when cloning, or adding a token for LOP authentication. + + - Improving security measures for LOP access, particularly around + token handling and authentication. + + - Developing standardized ways to configure and manage multiple LOPs + across different environments. Especially in the case where + different LOPs serve the same content to clients in different + geographical locations, there is a need for replication or + synchronization between LOPs. + +Some improvements, including some that have been mentioned in the "0) +Non Goals" section of this document, are out of the scope of this +document: + + - Implementing a new object representation for large blobs on the + client side. + + - Developing pluggable ODBs or other object database backends that + could chunk large blobs, dedup the chunks and store them + efficiently. + + - Optimizing data transfer between LOPs and clients/servers, + particularly for incompressible and non-deltifying content. + + - Creating improved client side tools for managing large objects + more effectively, for example tools for migrating from Git LFS or + git-annex, or tools to find which objects could be offloaded and + how much disk space could be reclaimed by offloading them. + +Some improvements could be seen as part of the scope of this document, +but might already have their own separate projects from the Git +project, like: + + - Improving existing remote helpers to access object storage or + developing new ones. + + - Improving existing object storage solutions or developing new + ones. + +Even though all the above improvements may help, this document and the +LOP effort should try to focus, at least first, on a relatively small +number of improvements mostly those that are in its current scope. + +For example introducing pluggable ODBs and a new object database +backend is likely a multi-year effort on its own that can happen +separately in parallel. It has different technical requirements, +touches other part of the Git code base and should have its own design +document(s). diff --git a/Documentation/technical/long-running-process-protocol.txt b/Documentation/technical/long-running-process-protocol.adoc index 6f33654b42..6f33654b42 100644 --- a/Documentation/technical/long-running-process-protocol.txt +++ b/Documentation/technical/long-running-process-protocol.adoc diff --git a/Documentation/technical/meson.build b/Documentation/technical/meson.build new file mode 100644 index 0000000000..a13aafcfbb --- /dev/null +++ b/Documentation/technical/meson.build @@ -0,0 +1,67 @@ +api_docs = [ + 'api-error-handling.adoc', + 'api-merge.adoc', + 'api-parse-options.adoc', + 'api-simple-ipc.adoc', + 'api-trace2.adoc', +] + +articles = [ + 'bitmap-format.adoc', + 'build-systems.adoc', + 'bundle-uri.adoc', + 'commit-graph.adoc', + 'directory-rename-detection.adoc', + 'hash-function-transition.adoc', + 'long-running-process-protocol.adoc', + 'multi-pack-index.adoc', + 'packfile-uri.adoc', + 'pack-heuristics.adoc', + 'parallel-checkout.adoc', + 'partial-clone.adoc', + 'platform-support.adoc', + 'racy-git.adoc', + 'reftable.adoc', + 'remembering-renames.adoc', + 'repository-version.adoc', + 'rerere.adoc', + 'scalar.adoc', + 'send-pack-pipeline.adoc', + 'shallow.adoc', + 'sparse-checkout.adoc', + 'sparse-index.adoc', + 'trivial-merge.adoc', + 'unit-tests.adoc', +] + +api_index = custom_target( + command: [ + shell, + meson.current_source_dir() / 'api-index.sh', + meson.current_source_dir(), + '@OUTPUT@', + ], + env: script_environment, + input: api_docs, + output: 'api-index.adoc', +) + +custom_target( + command: asciidoc_html_options, + input: api_index, + output: 'api-index.html', + depends: documentation_deps, + install: true, + install_dir: get_option('datadir') / 'doc/git-doc/technical', +) + +foreach article : api_docs + articles + custom_target( + command: asciidoc_html_options, + input: article, + output: fs.stem(article) + '.html', + depends: documentation_deps, + install: true, + install_dir: get_option('datadir') / 'doc/git-doc/technical', + ) +endforeach diff --git a/Documentation/technical/multi-pack-index.txt b/Documentation/technical/multi-pack-index.adoc index cc063b30be..ffda70aa13 100644 --- a/Documentation/technical/multi-pack-index.txt +++ b/Documentation/technical/multi-pack-index.adoc @@ -164,19 +164,81 @@ objects_nr($H2) + objects_nr($H1) + i (in the C implementation, this is often computed as `i + m->num_objects_in_base`). +=== Pseudo-pack order for incremental MIDXs + +The original implementation of multi-pack reachability bitmaps defined +the pseudo-pack order in linkgit:gitformat-pack[5] (see the section +titled "multi-pack-index reverse indexes") roughly as follows: + +____ +In short, a MIDX's pseudo-pack is the de-duplicated concatenation of +objects in packs stored by the MIDX, laid out in pack order, and the +packs arranged in MIDX order (with the preferred pack coming first). +____ + +In the incremental MIDX design, we extend this definition to include +objects from multiple layers of the MIDX chain. The pseudo-pack order +for incremental MIDXs is determined by concatenating the pseudo-pack +ordering for each layer of the MIDX chain in order. Formally two objects +`o1` and `o2` are compared as follows: + +1. If `o1` appears in an earlier layer of the MIDX chain than `o2`, then + `o1` sorts ahead of `o2`. + +2. Otherwise, if `o1` and `o2` appear in the same MIDX layer, and that + MIDX layer has no base, then if one of `pack(o1)` and `pack(o2)` is + preferred and the other is not, then the preferred one sorts ahead of + the non-preferred one. If there is a base layer (i.e. the MIDX layer + is not the first layer in the chain), then if `pack(o1)` appears + earlier in that MIDX layer's pack order, then `o1` sorts ahead of + `o2`. Likewise if `pack(o2)` appears earlier, then the opposite is + true. + +3. Otherwise, `o1` and `o2` appear in the same pack, and thus in the + same MIDX layer. Sort `o1` and `o2` by their offset within their + containing packfile. + +Note that the preferred pack is a property of the MIDX chain, not the +individual layers themselves. Fundamentally we could introduce a +per-layer preferred pack, but this is less relevant now that we can +perform multi-pack reuse across the set of packs in a MIDX. + +=== Reachability bitmaps and incremental MIDXs + +Each layer of an incremental MIDX chain may have its objects (and the +objects from any previous layer in the same MIDX chain) represented in +its own `*.bitmap` file. + +The structure of a `*.bitmap` file belonging to an incremental MIDX +chain is identical to that of a non-incremental MIDX bitmap, or a +classic single-pack bitmap. Since objects are added to the end of the +incremental MIDX's pseudo-pack order (see above), it is possible to +extend a bitmap when appending to the end of a MIDX chain. + +(Note: it is possible likewise to compress a contiguous sequence of MIDX +incremental layers, and their `*.bitmap` files into a single layer and +`*.bitmap`, but this is not yet implemented.) + +The object positions used are global within the pseudo-pack order, so +subsequent layers will have, for example, `m->num_objects_in_base` +number of `0` bits in each of their four type bitmaps. This follows from +the fact that we only write type bitmap entries for objects present in +the layer immediately corresponding to the bitmap). + +Note also that only the bitmap pertaining to the most recent layer in an +incremental MIDX chain is used to store reachability information about +the interesting and uninteresting objects in a reachability query. +Earlier bitmap layers are only used to look up commit and pseudo-merge +bitmaps from that layer, as well as the type-level bitmaps for objects +in that layer. + +To simplify the implementation, type-level bitmaps are iterated +simultaneously, and their results are OR'd together to avoid recursively +calling internal bitmap functions. + Future Work ----------- -- The multi-pack-index allows many packfiles, especially in a context - where repacking is expensive (such as a very large repo), or - unexpected maintenance time is unacceptable (such as a high-demand - build machine). However, the multi-pack-index needs to be rewritten - in full every time. We can extend the format to be incremental, so - writes are fast. By storing a small "tip" multi-pack-index that - points to large "base" MIDX files, we can keep writes fast while - still reducing the number of binary searches required for object - lookups. - - If the multi-pack-index is extended to store a "stable object order" (a function Order(hash) = integer that is constant for a given hash, even as the multi-pack-index is updated) then MIDX bitmaps could be diff --git a/Documentation/technical/pack-heuristics.txt b/Documentation/technical/pack-heuristics.adoc index 95a07db6e8..95a07db6e8 100644 --- a/Documentation/technical/pack-heuristics.txt +++ b/Documentation/technical/pack-heuristics.adoc diff --git a/Documentation/technical/packfile-uri.txt b/Documentation/technical/packfile-uri.adoc index 9d453d4765..9d453d4765 100644 --- a/Documentation/technical/packfile-uri.txt +++ b/Documentation/technical/packfile-uri.adoc diff --git a/Documentation/technical/parallel-checkout.txt b/Documentation/technical/parallel-checkout.adoc index b4a144e5f4..b4a144e5f4 100644 --- a/Documentation/technical/parallel-checkout.txt +++ b/Documentation/technical/parallel-checkout.adoc diff --git a/Documentation/technical/partial-clone.txt b/Documentation/technical/partial-clone.adoc index cd948b0072..e513e391ea 100644 --- a/Documentation/technical/partial-clone.txt +++ b/Documentation/technical/partial-clone.adoc @@ -85,7 +85,7 @@ See "filter" in linkgit:gitprotocol-pack[5]. server to request filtering during packfile construction. + There are various filters available to accommodate different situations. -See "--filter=<filter-spec>" in Documentation/rev-list-options.txt. +See "--filter=<filter-spec>" in Documentation/rev-list-options.adoc. - On the server pack-objects applies the requested filter-spec as it creates "filtered" packfiles for the client. @@ -102,7 +102,7 @@ or commits that reference missing trees. - On the client a repository extension is added to the local config to prevent older versions of git from failing mid-operation because of missing objects that they cannot handle. - See "extensions.partialClone" in Documentation/technical/repository-version.txt" + See `extensions.partialClone` in linkgit:git-config[1]. Handling Missing Objects diff --git a/Documentation/technical/platform-support.txt b/Documentation/technical/platform-support.adoc index 0a2fb28d62..0a2fb28d62 100644 --- a/Documentation/technical/platform-support.txt +++ b/Documentation/technical/platform-support.adoc diff --git a/Documentation/technical/racy-git.txt b/Documentation/technical/racy-git.adoc index 59bea66c0f..59bea66c0f 100644 --- a/Documentation/technical/racy-git.txt +++ b/Documentation/technical/racy-git.adoc diff --git a/Documentation/technical/reftable.txt b/Documentation/technical/reftable.adoc index dd0b37c4e3..dd0b37c4e3 100644 --- a/Documentation/technical/reftable.txt +++ b/Documentation/technical/reftable.adoc diff --git a/Documentation/technical/remembering-renames.txt b/Documentation/technical/remembering-renames.adoc index 73f41761e2..73f41761e2 100644 --- a/Documentation/technical/remembering-renames.txt +++ b/Documentation/technical/remembering-renames.adoc diff --git a/Documentation/technical/repository-version.txt b/Documentation/technical/repository-version.adoc index 47281420fc..b9bb81a81f 100644 --- a/Documentation/technical/repository-version.txt +++ b/Documentation/technical/repository-version.adoc @@ -65,44 +65,6 @@ Note that if no extensions are specified in the config file, then provides no benefit, and makes the repository incompatible with older implementations of git). -This document will serve as the master list for extensions. Any -implementation wishing to define a new extension should make a note of -it here, in order to claim the name. - -The defined extensions are: - -==== `noop` - -This extension does not change git's behavior at all. It is useful only -for testing format-1 compatibility. - -==== `preciousObjects` - -When the config key `extensions.preciousObjects` is set to `true`, -objects in the repository MUST NOT be deleted (e.g., by `git-prune` or -`git repack -d`). - -==== `partialClone` - -When the config key `extensions.partialClone` is set, it indicates -that the repo was created with a partial clone (or later performed -a partial fetch) and that the remote may have omitted sending -certain unwanted objects. Such a remote is called a "promisor remote" -and it promises that all such omitted objects can be fetched from it -in the future. - -The value of this key is the name of the promisor remote. - -==== `worktreeConfig` - -If set, by default "git config" reads from both "config" and -"config.worktree" files from GIT_DIR in that order. In -multiple working directory mode, "config" file is shared while -"config.worktree" is per-working directory (i.e., it's in -GIT_COMMON_DIR/worktrees/<id>/config.worktree) - -==== `refStorage` - -Specifies the file format for the ref database. The valid values are -`files` (loose references with a packed-refs file) and `reftable` (see -Documentation/technical/reftable.txt). +The defined extensions are given in the `extensions.*` section of +linkgit:git-config[1]. Any implementation wishing to define a new +extension should make a note of it there, in order to claim the name. diff --git a/Documentation/technical/rerere.txt b/Documentation/technical/rerere.adoc index 580f23360a..580f23360a 100644 --- a/Documentation/technical/rerere.txt +++ b/Documentation/technical/rerere.adoc diff --git a/Documentation/technical/scalar.txt b/Documentation/technical/scalar.adoc index 921cb104c3..921cb104c3 100644 --- a/Documentation/technical/scalar.txt +++ b/Documentation/technical/scalar.adoc diff --git a/Documentation/technical/send-pack-pipeline.txt b/Documentation/technical/send-pack-pipeline.adoc index 9b5a0bc186..9b5a0bc186 100644 --- a/Documentation/technical/send-pack-pipeline.txt +++ b/Documentation/technical/send-pack-pipeline.adoc diff --git a/Documentation/technical/shallow.txt b/Documentation/technical/shallow.adoc index f3738baa0f..f3738baa0f 100644 --- a/Documentation/technical/shallow.txt +++ b/Documentation/technical/shallow.adoc diff --git a/Documentation/technical/sparse-checkout.txt b/Documentation/technical/sparse-checkout.adoc index d968659354..0f750ef3e3 100644 --- a/Documentation/technical/sparse-checkout.txt +++ b/Documentation/technical/sparse-checkout.adoc @@ -66,7 +66,7 @@ sparsity patterns: patterns from $GIT_DIR/info/sparse-checkout used to reasons: (1) users in cone mode specify directories rather than patterns (their directories are transformed into patterns, but users may think you are talking about non-cone mode if you use the - word "patterns"), and (b) the sparse specification might + word "patterns"), and (2) the sparse specification might transiently differ in the working tree or index from the sparsity patterns (see "Sparse specification vs. sparsity patterns"). @@ -356,8 +356,6 @@ understanding these differences can be beneficial. The behavior for these commands somewhat depends upon the merge strategy being used: * `ort` behaves as described above - * `recursive` tries to not vivify files unnecessarily, but does sometimes - vivify files without conflicts. * `octopus` and `resolve` will always vivify any file changed in the merge relative to the first parent, which is rather suboptimal. @@ -442,7 +440,7 @@ understanding these differences can be beneficial. * blame (only matters when one or more -C flags are passed) * and annotate * log - * whatchanged + * whatchanged (may not exist anymore) * ls-files * diff-index * diff-tree diff --git a/Documentation/technical/sparse-index.txt b/Documentation/technical/sparse-index.adoc index 3b24c1a219..3b24c1a219 100644 --- a/Documentation/technical/sparse-index.txt +++ b/Documentation/technical/sparse-index.adoc diff --git a/Documentation/technical/trivial-merge.txt b/Documentation/technical/trivial-merge.adoc index 1f1c33d0da..1f1c33d0da 100644 --- a/Documentation/technical/trivial-merge.txt +++ b/Documentation/technical/trivial-merge.adoc diff --git a/Documentation/technical/unit-tests.txt b/Documentation/technical/unit-tests.adoc index 5a432b7b29..5a432b7b29 100644 --- a/Documentation/technical/unit-tests.txt +++ b/Documentation/technical/unit-tests.adoc |
