32 files changed, 1232 insertions, 1439 deletions
diff --git a/reftable/basics.c b/reftable/basics.c
index 0785aff941..fea711db7e 100644
--- a/reftable/basics.c
+++ b/reftable/basics.c
@@ -27,7 +27,7 @@ void put_be16(uint8_t *out, uint16_t i)
 	out[1] = (uint8_t)(i & 0xff);
 }
 
-int binsearch(size_t sz, int (*f)(size_t k, void *args), void *args)
+size_t binsearch(size_t sz, int (*f)(size_t k, void *args), void *args)
 {
 	size_t lo = 0;
 	size_t hi = sz;
@@ -39,8 +39,11 @@ int binsearch(size_t sz, int (*f)(size_t k, void *args), void *args)
 	 */
 	while (hi - lo > 1) {
 		size_t mid = lo + (hi - lo) / 2;
+		int ret = f(mid, args);
+		if (ret < 0)
+			return sz;
 
-		if (f(mid, args))
+		if (ret > 0)
 			hi = mid;
 		else
 			lo = mid;
diff --git a/reftable/basics.h b/reftable/basics.h
index 91f3533efe..523ecd5307 100644
--- a/reftable/basics.h
+++ b/reftable/basics.h
@@ -22,13 +22,14 @@ uint32_t get_be24(uint8_t *in);
 void put_be16(uint8_t *out, uint16_t i);
 
 /*
- * find smallest index i in [0, sz) at which f(i) is true, assuming
- * that f is ascending. Return sz if f(i) is false for all indices.
+ * find smallest index i in [0, sz) at which `f(i) > 0`, assuming that f is
+ * ascending. Return sz if `f(i) == 0` for all indices. The search is aborted
+ * and `sz` is returned in case `f(i) < 0`.
  *
  * Contrary to bsearch(3), this returns something useful if the argument is not
  * found.
  */
-int binsearch(size_t sz, int (*f)(size_t k, void *args), void *args);
+size_t binsearch(size_t sz, int (*f)(size_t k, void *args), void *args);
 
 /*
  * Frees a NULL terminated array of malloced strings. The array itself is also
diff --git a/reftable/basics_test.c b/reftable/basics_test.c
index 1fcd229725..997c4d9e01 100644
--- a/reftable/basics_test.c
+++ b/reftable/basics_test.c
@@ -12,40 +12,47 @@ https://developers.google.com/open-source/licenses/bsd
 #include "test_framework.h"
 #include "reftable-tests.h"
 
-struct binsearch_args {
-	int key;
-	int *arr;
+struct integer_needle_lesseq_args {
+	int needle;
+	int *haystack;
 };
 
-static int binsearch_func(size_t i, void *void_args)
+static int integer_needle_lesseq(size_t i, void *_args)
 {
-	struct binsearch_args *args = void_args;
-
-	return args->key < args->arr[i];
+	struct integer_needle_lesseq_args *args = _args;
+	return args->needle <= args->haystack[i];
 }
 
 static void test_binsearch(void)
 {
-	int arr[] = { 2, 4, 6, 8, 10 };
-	size_t sz = ARRAY_SIZE(arr);
-	struct binsearch_args args = {
-		.arr = arr,
+	int haystack[] = { 2, 4, 6, 8, 10 };
+	struct {
+		int needle;
+		size_t expected_idx;
+	} testcases[] = {
+		{-9000, 0},
+		{-1, 0},
+		{0, 0},
+		{2, 0},
+		{3, 1},
+		{4, 1},
+		{7, 3},
+		{9, 4},
+		{10, 4},
+		{11, 5},
+		{9000, 5},
 	};
+	size_t i = 0;
 
-	int i = 0;
-	for (i = 1; i < 11; i++) {
-		int res;
-		args.key = i;
-		res = binsearch(sz, &binsearch_func, &args);
+	for (i = 0; i < ARRAY_SIZE(testcases); i++) {
+		struct integer_needle_lesseq_args args = {
+			.haystack = haystack,
+			.needle = testcases[i].needle,
+		};
+		size_t idx;
 
-		if (res < sz) {
-			EXPECT(args.key < arr[res]);
-			if (res > 0) {
-				EXPECT(args.key >= arr[res - 1]);
-			}
-		} else {
-			EXPECT(args.key == 10 || args.key == 11);
-		}
+		idx = binsearch(ARRAY_SIZE(haystack), &integer_needle_lesseq, &args);
+		EXPECT(idx == testcases[i].expected_idx);
 	}
 }
 
diff --git a/reftable/block.c b/reftable/block.c
index ce8a7d24f3..5942cb4053 100644
--- a/reftable/block.c
+++ b/reftable/block.c
@@ -76,6 +76,10 @@ void block_writer_init(struct block_writer *bw, uint8_t typ, uint8_t *buf,
 	bw->entries = 0;
 	bw->restart_len = 0;
 	bw->last_key.len = 0;
+	if (!bw->zstream) {
+		REFTABLE_CALLOC_ARRAY(bw->zstream, 1);
+		deflateInit(bw->zstream, 9);
+	}
 }
 
 uint8_t block_writer_type(struct block_writer *bw)
@@ -139,45 +143,53 @@ int block_writer_finish(struct block_writer *w)
 	w->next += 2;
 	put_be24(w->buf + 1 + w->header_off, w->next);
 
+	/*
+	 * Log records are stored zlib-compressed. Note that the compression
+	 * also spans over the restart points we have just written.
+	 */
 	if (block_writer_type(w) == BLOCK_TYPE_LOG) {
 		int block_header_skip = 4 + w->header_off;
-		uLongf src_len = w->next - block_header_skip;
-		uLongf dest_cap = src_len * 1.001 + 12;
-		uint8_t *compressed;
-
-		REFTABLE_ALLOC_ARRAY(compressed, dest_cap);
-
-		while (1) {
-			uLongf out_dest_len = dest_cap;
-			int zresult = compress2(compressed, &out_dest_len,
-						w->buf + block_header_skip,
-						src_len, 9);
-			if (zresult == Z_BUF_ERROR && dest_cap < LONG_MAX) {
-				dest_cap *= 2;
-				compressed =
-					reftable_realloc(compressed, dest_cap);
-				if (compressed)
-					continue;
-			}
-
-			if (Z_OK != zresult) {
-				reftable_free(compressed);
-				return REFTABLE_ZLIB_ERROR;
-			}
-
-			memcpy(w->buf + block_header_skip, compressed,
-			       out_dest_len);
-			w->next = out_dest_len + block_header_skip;
-			reftable_free(compressed);
-			break;
-		}
+		uLongf src_len = w->next - block_header_skip, compressed_len;
+		int ret;
+
+		ret = deflateReset(w->zstream);
+		if (ret != Z_OK)
+			return REFTABLE_ZLIB_ERROR;
+
+		/*
+		 * Precompute the upper bound of how many bytes the compressed
+		 * data may end up with. Combined with `Z_FINISH`, `deflate()`
+		 * is guaranteed to return `Z_STREAM_END`.
+		 */
+		compressed_len = deflateBound(w->zstream, src_len);
+		REFTABLE_ALLOC_GROW(w->compressed, compressed_len, w->compressed_cap);
+
+		w->zstream->next_out = w->compressed;
+		w->zstream->avail_out = compressed_len;
+		w->zstream->next_in = w->buf + block_header_skip;
+		w->zstream->avail_in = src_len;
+
+		/*
+		 * We want to perform all decompression in a single step, which
+		 * is why we can pass Z_FINISH here. As we have precomputed the
+		 * deflated buffer's size via `deflateBound()` this function is
+		 * guaranteed to succeed according to the zlib documentation.
+		 */
+		ret = deflate(w->zstream, Z_FINISH);
+		if (ret != Z_STREAM_END)
+			return REFTABLE_ZLIB_ERROR;
+
+		/*
+		 * Overwrite the uncompressed data we have already written and
+		 * adjust the `next` pointer to point right after the
+		 * compressed data.
+		 */
+		memcpy(w->buf + block_header_skip, w->compressed,
+		       w->zstream->total_out);
+		w->next = w->zstream->total_out + block_header_skip;
 	}
-	return w->next;
-}
 
-uint8_t block_reader_type(struct block_reader *r)
-{
-	return r->block.data[r->header_off];
+	return w->next;
 }
 
 int block_reader_init(struct block_reader *br, struct reftable_block *block,
@@ -191,7 +203,8 @@ int block_reader_init(struct block_reader *br, struct reftable_block *block,
 	uint16_t restart_count = 0;
 	uint32_t restart_start = 0;
 	uint8_t *restart_bytes = NULL;
-	uint8_t *uncompressed = NULL;
+
+	reftable_block_done(&br->block);
 
 	if (!reftable_is_block_type(typ)) {
 		err =  REFTABLE_FORMAT_ERROR;
@@ -199,37 +212,57 @@ int block_reader_init(struct block_reader *br, struct reftable_block *block,
 	}
 
 	if (typ == BLOCK_TYPE_LOG) {
-		int block_header_skip = 4 + header_off;
-		uLongf dst_len = sz - block_header_skip; /* total size of dest
-							    buffer. */
-		uLongf src_len = block->len - block_header_skip;
+		uint32_t block_header_skip = 4 + header_off;
+		uLong dst_len = sz - block_header_skip;
+		uLong src_len = block->len - block_header_skip;
 
 		/* Log blocks specify the *uncompressed* size in their header. */
-		REFTABLE_ALLOC_ARRAY(uncompressed, sz);
+		REFTABLE_ALLOC_GROW(br->uncompressed_data, sz,
+				    br->uncompressed_cap);
 
 		/* Copy over the block header verbatim. It's not compressed. */
-		memcpy(uncompressed, block->data, block_header_skip);
+		memcpy(br->uncompressed_data, block->data, block_header_skip);
 
-		/* Uncompress */
-		if (Z_OK !=
-		    uncompress2(uncompressed + block_header_skip, &dst_len,
-				block->data + block_header_skip, &src_len)) {
+		if (!br->zstream) {
+			REFTABLE_CALLOC_ARRAY(br->zstream, 1);
+			err = inflateInit(br->zstream);
+		} else {
+			err = inflateReset(br->zstream);
+		}
+		if (err != Z_OK) {
 			err = REFTABLE_ZLIB_ERROR;
 			goto done;
 		}
 
-		if (dst_len + block_header_skip != sz) {
+		br->zstream->next_in = block->data + block_header_skip;
+		br->zstream->avail_in = src_len;
+		br->zstream->next_out = br->uncompressed_data + block_header_skip;
+		br->zstream->avail_out = dst_len;
+
+		/*
+		 * We know both input as well as output size, and we know that
+		 * the sizes should never be bigger than `uInt_MAX` because
+		 * blocks can at most be 16MB large. We can thus use `Z_FINISH`
+		 * here to instruct zlib to inflate the data in one go, which
+		 * is more efficient than using `Z_NO_FLUSH`.
+		 */
+		err = inflate(br->zstream, Z_FINISH);
+		if (err != Z_STREAM_END) {
+			err = REFTABLE_ZLIB_ERROR;
+			goto done;
+		}
+		err = 0;
+
+		if (br->zstream->total_out + block_header_skip != sz) {
 			err = REFTABLE_FORMAT_ERROR;
 			goto done;
 		}
 
 		/* We're done with the input data. */
 		reftable_block_done(block);
-		block->data = uncompressed;
-		uncompressed = NULL;
+		block->data = br->uncompressed_data;
 		block->len = sz;
-		block->source = malloc_block_source();
-		full_block_size = src_len + block_header_skip;
+		full_block_size = src_len + block_header_skip - br->zstream->avail_in;
 	} else if (full_block_size == 0) {
 		full_block_size = sz;
 	} else if (sz < full_block_size && sz < block->len &&
@@ -257,177 +290,251 @@ int block_reader_init(struct block_reader *br, struct reftable_block *block,
 	br->restart_bytes = restart_bytes;
 
 done:
-	reftable_free(uncompressed);
 	return err;
 }
 
-static uint32_t block_reader_restart_offset(struct block_reader *br, int i)
+void block_reader_release(struct block_reader *br)
+{
+	inflateEnd(br->zstream);
+	reftable_free(br->zstream);
+	reftable_free(br->uncompressed_data);
+	reftable_block_done(&br->block);
+}
+
+uint8_t block_reader_type(const struct block_reader *r)
+{
+	return r->block.data[r->header_off];
+}
+
+int block_reader_first_key(const struct block_reader *br, struct strbuf *key)
+{
+	int off = br->header_off + 4, n;
+	struct string_view in = {
+		.buf = br->block.data + off,
+		.len = br->block_len - off,
+	};
+	uint8_t extra = 0;
+
+	strbuf_reset(key);
+
+	n = reftable_decode_key(key, &extra, in);
+	if (n < 0)
+		return n;
+	if (!key->len)
+		return REFTABLE_FORMAT_ERROR;
+
+	return 0;
+}
+
+static uint32_t block_reader_restart_offset(const struct block_reader *br, int i)
 {
 	return get_be24(br->restart_bytes + 3 * i);
 }
 
-void block_reader_start(struct block_reader *br, struct block_iter *it)
+void block_iter_seek_start(struct block_iter *it, const struct block_reader *br)
 {
-	it->br = br;
+	it->block = br->block.data;
+	it->block_len = br->block_len;
+	it->hash_size = br->hash_size;
 	strbuf_reset(&it->last_key);
 	it->next_off = br->header_off + 4;
 }
 
-struct restart_find_args {
+struct restart_needle_less_args {
 	int error;
-	struct strbuf key;
-	struct block_reader *r;
+	struct strbuf needle;
+	const struct block_reader *reader;
 };
 
-static int restart_key_less(size_t idx, void *args)
+static int restart_needle_less(size_t idx, void *_args)
 {
-	struct restart_find_args *a = args;
-	uint32_t off = block_reader_restart_offset(a->r, idx);
+	struct restart_needle_less_args *args = _args;
+	uint32_t off = block_reader_restart_offset(args->reader, idx);
 	struct string_view in = {
-		.buf = a->r->block.data + off,
-		.len = a->r->block_len - off,
+		.buf = args->reader->block.data + off,
+		.len = args->reader->block_len - off,
 	};
-
-	/* the restart key is verbatim in the block, so this could avoid the
-	   alloc for decoding the key */
-	struct strbuf rkey = STRBUF_INIT;
-	struct strbuf last_key = STRBUF_INIT;
-	uint8_t unused_extra;
-	int n = reftable_decode_key(&rkey, &unused_extra, last_key, in);
-	int result;
-	if (n < 0) {
-		a->error = 1;
+	uint64_t prefix_len, suffix_len;
+	uint8_t extra;
+	int n;
+
+	/*
+	 * Records at restart points are stored without prefix compression, so
+	 * there is no need to fully decode the record key here. This removes
+	 * the need for allocating memory.
+	 */
+	n = reftable_decode_keylen(in, &prefix_len, &suffix_len, &extra);
+	if (n < 0 || prefix_len) {
+		args->error = 1;
 		return -1;
 	}
 
-	result = strbuf_cmp(&a->key, &rkey);
-	strbuf_release(&rkey);
-	return result;
-}
+	string_view_consume(&in, n);
+	if (suffix_len > in.len) {
+		args->error = 1;
+		return -1;
+	}
 
-void block_iter_copy_from(struct block_iter *dest, struct block_iter *src)
-{
-	dest->br = src->br;
-	dest->next_off = src->next_off;
-	strbuf_reset(&dest->last_key);
-	strbuf_addbuf(&dest->last_key, &src->last_key);
+	n = memcmp(args->needle.buf, in.buf,
+		   args->needle.len < suffix_len ? args->needle.len : suffix_len);
+	if (n)
+		return n < 0;
+	return args->needle.len < suffix_len;
 }
 
 int block_iter_next(struct block_iter *it, struct reftable_record *rec)
 {
 	struct string_view in = {
-		.buf = it->br->block.data + it->next_off,
-		.len = it->br->block_len - it->next_off,
+		.buf = (unsigned char *) it->block + it->next_off,
+		.len = it->block_len - it->next_off,
 	};
 	struct string_view start = in;
 	uint8_t extra = 0;
 	int n = 0;
 
-	if (it->next_off >= it->br->block_len)
+	if (it->next_off >= it->block_len)
 		return 1;
 
-	n = reftable_decode_key(&it->key, &extra, it->last_key, in);
+	n = reftable_decode_key(&it->last_key, &extra, in);
 	if (n < 0)
 		return -1;
-
-	if (!it->key.len)
+	if (!it->last_key.len)
 		return REFTABLE_FORMAT_ERROR;
 
 	string_view_consume(&in, n);
-	n = reftable_record_decode(rec, it->key, extra, in, it->br->hash_size);
+	n = reftable_record_decode(rec, it->last_key, extra, in, it->hash_size,
+				   &it->scratch);
 	if (n < 0)
 		return -1;
 	string_view_consume(&in, n);
 
-	strbuf_reset(&it->last_key);
-	strbuf_addbuf(&it->last_key, &it->key);
 	it->next_off += start.len - in.len;
 	return 0;
 }
 
-int block_reader_first_key(struct block_reader *br, struct strbuf *key)
-{
-	struct strbuf empty = STRBUF_INIT;
-	int off = br->header_off + 4;
-	struct string_view in = {
-		.buf = br->block.data + off,
-		.len = br->block_len - off,
-	};
-
-	uint8_t extra = 0;
-	int n = reftable_decode_key(key, &extra, empty, in);
-	if (n < 0)
-		return n;
-	if (!key->len)
-		return REFTABLE_FORMAT_ERROR;
-
-	return 0;
-}
-
-int block_iter_seek(struct block_iter *it, struct strbuf *want)
+void block_iter_reset(struct block_iter *it)
 {
-	return block_reader_seek(it->br, it, want);
+	strbuf_reset(&it->last_key);
+	it->next_off = 0;
+	it->block = NULL;
+	it->block_len = 0;
+	it->hash_size = 0;
 }
 
 void block_iter_close(struct block_iter *it)
 {
 	strbuf_release(&it->last_key);
-	strbuf_release(&it->key);
+	strbuf_release(&it->scratch);
 }
 
-int block_reader_seek(struct block_reader *br, struct block_iter *it,
-		      struct strbuf *want)
+int block_iter_seek_key(struct block_iter *it, const struct block_reader *br,
+			struct strbuf *want)
 {
-	struct restart_find_args args = {
-		.key = *want,
-		.r = br,
+	struct restart_needle_less_args args = {
+		.needle = *want,
+		.reader = br,
 	};
-	struct block_iter next = BLOCK_ITER_INIT;
 	struct reftable_record rec;
-	int err = 0, i;
-
+	int err = 0;
+	size_t i;
+
+	/*
+	 * Perform a binary search over the block's restart points, which
+	 * avoids doing a linear scan over the whole block. Like this, we
+	 * identify the section of the block that should contain our key.
+	 *
+	 * Note that we explicitly search for the first restart point _greater_
+	 * than the sought-after record, not _greater or equal_ to it. In case
+	 * the sought-after record is located directly at the restart point we
+	 * would otherwise start doing the linear search at the preceding
+	 * restart point. While that works alright, we would end up scanning
+	 * too many record.
+	 */
+	i = binsearch(br->restart_count, &restart_needle_less, &args);
 	if (args.error) {
 		err = REFTABLE_FORMAT_ERROR;
 		goto done;
 	}
 
-	i = binsearch(br->restart_count, &restart_key_less, &args);
+	/*
+	 * Now there are multiple cases:
+	 *
+	 *   - `i == 0`: The wanted record is smaller than the record found at
+	 *     the first restart point. As the first restart point is the first
+	 *     record in the block, our wanted record cannot be located in this
+	 *     block at all. We still need to position the iterator so that the
+	 *     next call to `block_iter_next()` will yield an end-of-iterator
+	 *     signal.
+	 *
+	 *   - `i == restart_count`: The wanted record was not found at any of
+	 *     the restart points. As there is no restart point at the end of
+	 *     the section the record may thus be contained in the last block.
+	 *
+	 *   - `i > 0`: The wanted record must be contained in the section
+	 *     before the found restart point. We thus do a linear search
+	 *     starting from the preceding restart point.
+	 */
 	if (i > 0)
 		it->next_off = block_reader_restart_offset(br, i - 1);
 	else
 		it->next_off = br->header_off + 4;
-	it->br = br;
+	it->block = br->block.data;
+	it->block_len = br->block_len;
+	it->hash_size = br->hash_size;
 
 	reftable_record_init(&rec, block_reader_type(br));
 
-	/* We're looking for the last entry less/equal than the wanted key, so
-	   we have to go one entry too far and then back up.
-	*/
+	/*
+	 * We're looking for the last entry less than the wanted key so that
+	 * the next call to `block_reader_next()` would yield the wanted
+	 * record. We thus don't want to position our reader at the sought
+	 * after record, but one before. To do so, we have to go one entry too
+	 * far and then back up.
+	 */
 	while (1) {
-		block_iter_copy_from(&next, it);
-		err = block_iter_next(&next, &rec);
+		size_t prev_off = it->next_off;
+
+		err = block_iter_next(it, &rec);
 		if (err < 0)
 			goto done;
-
-		reftable_record_key(&rec, &it->key);
-		if (err > 0 || strbuf_cmp(&it->key, want) >= 0) {
+		if (err > 0) {
+			it->next_off = prev_off;
 			err = 0;
 			goto done;
 		}
 
-		block_iter_copy_from(it, &next);
+		/*
+		 * Check whether the current key is greater or equal to the
+		 * sought-after key. In case it is greater we know that the
+		 * record does not exist in the block and can thus abort early.
+		 * In case it is equal to the sought-after key we have found
+		 * the desired record.
+		 *
+		 * Note that we store the next record's key record directly in
+		 * `last_key` without restoring the key of the preceding record
+		 * in case we need to go one record back. This is safe to do as
+		 * `block_iter_next()` would return the ref whose key is equal
+		 * to `last_key` now, and naturally all keys share a prefix
+		 * with themselves.
+		 */
+		reftable_record_key(&rec, &it->last_key);
+		if (strbuf_cmp(&it->last_key, want) >= 0) {
+			it->next_off = prev_off;
+			goto done;
+		}
 	}
 
 done:
-	block_iter_close(&next);
 	reftable_record_release(&rec);
-
 	return err;
 }
 
 void block_writer_release(struct block_writer *bw)
 {
+	deflateEnd(bw->zstream);
+	FREE_AND_NULL(bw->zstream);
 	FREE_AND_NULL(bw->restarts);
+	FREE_AND_NULL(bw->compressed);
 	strbuf_release(&bw->last_key);
 	/* the block is not owned. */
 }
diff --git a/reftable/block.h b/reftable/block.h
index 17481e6331..e91f3d2790 100644
--- a/reftable/block.h
+++ b/reftable/block.h
@@ -18,6 +18,10 @@ https://developers.google.com/open-source/licenses/bsd
  * allocation overhead.
  */
 struct block_writer {
+	z_stream *zstream;
+	unsigned char *compressed;
+	size_t compressed_cap;
+
 	uint8_t *buf;
 	uint32_t block_size;
 
@@ -56,6 +60,8 @@ int block_writer_finish(struct block_writer *w);
 /* clears out internally allocated block_writer members. */
 void block_writer_release(struct block_writer *bw);
 
+struct z_stream;
+
 /* Read a block. */
 struct block_reader {
 	/* offset of the block header; nonzero for the first block in a
@@ -66,6 +72,11 @@ struct block_reader {
 	struct reftable_block block;
 	int hash_size;
 
+	/* Uncompressed data for log entries. */
+	z_stream *zstream;
+	unsigned char *uncompressed_data;
+	size_t uncompressed_cap;
+
 	/* size of the data, excluding restart data. */
 	uint32_t block_len;
 	uint8_t *restart_bytes;
@@ -76,47 +87,49 @@ struct block_reader {
 	uint32_t full_block_size;
 };
 
+/* initializes a block reader. */
+int block_reader_init(struct block_reader *br, struct reftable_block *bl,
+		      uint32_t header_off, uint32_t table_block_size,
+		      int hash_size);
+
+void block_reader_release(struct block_reader *br);
+
+/* Returns the block type (eg. 'r' for refs) */
+uint8_t block_reader_type(const struct block_reader *r);
+
+/* Decodes the first key in the block */
+int block_reader_first_key(const struct block_reader *br, struct strbuf *key);
+
 /* Iterate over entries in a block */
 struct block_iter {
 	/* offset within the block of the next entry to read. */
 	uint32_t next_off;
-	struct block_reader *br;
+	const unsigned char *block;
+	size_t block_len;
+	int hash_size;
 
 	/* key for last entry we read. */
 	struct strbuf last_key;
-	struct strbuf key;
+	struct strbuf scratch;
 };
 
 #define BLOCK_ITER_INIT { \
 	.last_key = STRBUF_INIT, \
-	.key = STRBUF_INIT, \
+	.scratch = STRBUF_INIT, \
 }
 
-/* initializes a block reader. */
-int block_reader_init(struct block_reader *br, struct reftable_block *bl,
-		      uint32_t header_off, uint32_t table_block_size,
-		      int hash_size);
-
 /* Position `it` at start of the block */
-void block_reader_start(struct block_reader *br, struct block_iter *it);
+void block_iter_seek_start(struct block_iter *it, const struct block_reader *br);
 
 /* Position `it` to the `want` key in the block */
-int block_reader_seek(struct block_reader *br, struct block_iter *it,
-		      struct strbuf *want);
-
-/* Returns the block type (eg. 'r' for refs) */
-uint8_t block_reader_type(struct block_reader *r);
-
-/* Decodes the first key in the block */
-int block_reader_first_key(struct block_reader *br, struct strbuf *key);
-
-void block_iter_copy_from(struct block_iter *dest, struct block_iter *src);
+int block_iter_seek_key(struct block_iter *it, const struct block_reader *br,
+			struct strbuf *want);
 
 /* return < 0 for error, 0 for OK, > 0 for EOF. */
 int block_iter_next(struct block_iter *it, struct reftable_record *rec);
 
-/* Seek to `want` with in the block pointed to by `it` */
-int block_iter_seek(struct block_iter *it, struct strbuf *want);
+/* Reset the block iterator to pristine state without releasing its memory. */
+void block_iter_reset(struct block_iter *it);
 
 /* deallocate memory for `it`. The block reader and its block is left intact. */
 void block_iter_close(struct block_iter *it);
diff --git a/reftable/block_test.c b/reftable/block_test.c
index e162c6e33f..26a9cfbc83 100644
--- a/reftable/block_test.c
+++ b/reftable/block_test.c
@@ -69,7 +69,7 @@ static void test_block_read_write(void)
 
 	block_reader_init(&br, &block, header_off, block_size, GIT_SHA1_RAWSZ);
 
-	block_reader_start(&br, &it);
+	block_iter_seek_start(&it, &br);
 
 	while (1) {
 		int r = block_iter_next(&it, &rec);
@@ -89,7 +89,7 @@ static void test_block_read_write(void)
 		strbuf_reset(&want);
 		strbuf_addstr(&want, names[i]);
 
-		n = block_reader_seek(&br, &it, &want);
+		n = block_iter_seek_key(&it, &br, &want);
 		EXPECT(n == 0);
 
 		n = block_iter_next(&it, &rec);
@@ -98,7 +98,7 @@ static void test_block_read_write(void)
 		EXPECT_STREQ(names[i], rec.u.ref.refname);
 
 		want.len--;
-		n = block_reader_seek(&br, &it, &want);
+		n = block_iter_seek_key(&it, &br, &want);
 		EXPECT(n == 0);
 
 		n = block_iter_next(&it, &rec);
diff --git a/reftable/error.c b/reftable/error.c
index 0d1766735e..a25f28a43e 100644
--- a/reftable/error.c
+++ b/reftable/error.c
@@ -22,19 +22,19 @@ const char *reftable_error_str(int err)
 	case REFTABLE_NOT_EXIST_ERROR:
 		return "file does not exist";
 	case REFTABLE_LOCK_ERROR:
-		return "data is outdated";
+		return "data is locked";
 	case REFTABLE_API_ERROR:
 		return "misuse of the reftable API";
 	case REFTABLE_ZLIB_ERROR:
 		return "zlib failure";
-	case REFTABLE_NAME_CONFLICT:
-		return "file/directory conflict";
 	case REFTABLE_EMPTY_TABLE_ERROR:
 		return "wrote empty table";
 	case REFTABLE_REFNAME_ERROR:
 		return "invalid refname";
 	case REFTABLE_ENTRY_TOO_BIG_ERROR:
 		return "entry too large";
+	case REFTABLE_OUTDATED_ERROR:
+		return "data concurrently modified";
 	case -1:
 		return "general error";
 	default:
diff --git a/reftable/iter.c b/reftable/iter.c
index 8b5ebf6183..aa9ac199b1 100644
--- a/reftable/iter.c
+++ b/reftable/iter.c
@@ -16,11 +16,6 @@ https://developers.google.com/open-source/licenses/bsd
 #include "reader.h"
 #include "reftable-error.h"
 
-int iterator_is_null(struct reftable_iterator *it)
-{
-	return !it->ops;
-}
-
 static void filtering_ref_iterator_close(void *iter_arg)
 {
 	struct filtering_ref_iterator *fri = iter_arg;
@@ -120,7 +115,7 @@ static int indexed_table_ref_iter_next_block(struct indexed_table_ref_iter *it)
 		/* indexed block does not exist. */
 		return REFTABLE_FORMAT_ERROR;
 	}
-	block_reader_start(&it->block_reader, &it->cur);
+	block_iter_seek_start(&it->cur, &it->block_reader);
 	return 0;
 }
 
diff --git a/reftable/iter.h b/reftable/iter.h
index 47d67d84df..537431baba 100644
--- a/reftable/iter.h
+++ b/reftable/iter.h
@@ -16,10 +16,6 @@ https://developers.google.com/open-source/licenses/bsd
 #include "reftable-iterator.h"
 #include "reftable-generic.h"
 
-/* Returns true for a zeroed out iterator, such as the one returned from
- * iterator_destroy. */
-int iterator_is_null(struct reftable_iterator *it);
-
 /* iterator that produces only ref records that point to `oid` */
 struct filtering_ref_iterator {
 	int double_check;
diff --git a/reftable/merged.c b/reftable/merged.c
index a0f222e07b..f85a24c678 100644
--- a/reftable/merged.c
+++ b/reftable/merged.c
@@ -17,23 +17,37 @@ https://developers.google.com/open-source/licenses/bsd
 #include "reftable-error.h"
 #include "system.h"
 
+struct merged_subiter {
+	struct reftable_iterator iter;
+	struct reftable_record rec;
+};
+
+struct merged_iter {
+	struct merged_subiter *subiters;
+	struct merged_iter_pqueue pq;
+	uint32_t hash_id;
+	size_t stack_len;
+	uint8_t typ;
+	int suppress_deletions;
+	ssize_t advance_index;
+};
+
 static int merged_iter_init(struct merged_iter *mi)
 {
 	for (size_t i = 0; i < mi->stack_len; i++) {
 		struct pq_entry e = {
 			.index = i,
+			.rec = &mi->subiters[i].rec,
 		};
 		int err;
 
-		reftable_record_init(&e.rec, mi->typ);
-		err = iterator_next(&mi->stack[i], &e.rec);
+		reftable_record_init(&mi->subiters[i].rec, mi->typ);
+		err = iterator_next(&mi->subiters[i].iter,
+				    &mi->subiters[i].rec);
 		if (err < 0)
 			return err;
-		if (err > 0) {
-			reftable_iterator_destroy(&mi->stack[i]);
-			reftable_record_release(&e.rec);
+		if (err > 0)
 			continue;
-		}
 
 		merged_iter_pqueue_add(&mi->pq, &e);
 	}
@@ -46,56 +60,66 @@ static void merged_iter_close(void *p)
 	struct merged_iter *mi = p;
 
 	merged_iter_pqueue_release(&mi->pq);
-	for (size_t i = 0; i < mi->stack_len; i++)
-		reftable_iterator_destroy(&mi->stack[i]);
-	reftable_free(mi->stack);
-	strbuf_release(&mi->key);
-	strbuf_release(&mi->entry_key);
+	for (size_t i = 0; i < mi->stack_len; i++) {
+		reftable_iterator_destroy(&mi->subiters[i].iter);
+		reftable_record_release(&mi->subiters[i].rec);
+	}
+	reftable_free(mi->subiters);
 }
 
-static int merged_iter_advance_nonnull_subiter(struct merged_iter *mi,
-					       size_t idx)
+static int merged_iter_advance_subiter(struct merged_iter *mi, size_t idx)
 {
 	struct pq_entry e = {
 		.index = idx,
+		.rec = &mi->subiters[idx].rec,
 	};
 	int err;
 
-	reftable_record_init(&e.rec, mi->typ);
-	err = iterator_next(&mi->stack[idx], &e.rec);
-	if (err < 0)
+	err = iterator_next(&mi->subiters[idx].iter, &mi->subiters[idx].rec);
+	if (err)
 		return err;
 
-	if (err > 0) {
-		reftable_iterator_destroy(&mi->stack[idx]);
-		reftable_record_release(&e.rec);
-		return 0;
-	}
-
 	merged_iter_pqueue_add(&mi->pq, &e);
 	return 0;
 }
 
-static int merged_iter_advance_subiter(struct merged_iter *mi, size_t idx)
-{
-	if (iterator_is_null(&mi->stack[idx]))
-		return 0;
-	return merged_iter_advance_nonnull_subiter(mi, idx);
-}
-
 static int merged_iter_next_entry(struct merged_iter *mi,
 				  struct reftable_record *rec)
 {
 	struct pq_entry entry = { 0 };
-	int err = 0;
+	int err = 0, empty;
+
+	empty = merged_iter_pqueue_is_empty(mi->pq);
+
+	if (mi->advance_index >= 0) {
+		/*
+		 * When there are no pqueue entries then we only have a single
+		 * subiter left. There is no need to use the pqueue in that
+		 * case anymore as we know that the subiter will return entries
+		 * in the correct order already.
+		 *
+		 * While this may sound like a very specific edge case, it may
+		 * happen more frequently than you think. Most repositories
+		 * will end up having a single large base table that contains
+		 * most of the refs. It's thus likely that we exhaust all
+		 * subiters but the one from that base ref.
+		 */
+		if (empty)
+			return iterator_next(&mi->subiters[mi->advance_index].iter,
+					     rec);
+
+		err = merged_iter_advance_subiter(mi, mi->advance_index);
+		if (err < 0)
+			return err;
+		if (!err)
+			empty = 0;
+		mi->advance_index = -1;
+	}
 
-	if (merged_iter_pqueue_is_empty(mi->pq))
+	if (empty)
 		return 1;
 
 	entry = merged_iter_pqueue_remove(&mi->pq);
-	err = merged_iter_advance_subiter(mi, entry.index);
-	if (err < 0)
-		return err;
 
 	/*
 	  One can also use reftable as datacenter-local storage, where the ref
@@ -105,55 +129,38 @@ static int merged_iter_next_entry(struct merged_iter *mi,
 	  such a deployment, the loop below must be changed to collect all
 	  entries for the same key, and return new the newest one.
 	*/
-	reftable_record_key(&entry.rec, &mi->entry_key);
 	while (!merged_iter_pqueue_is_empty(mi->pq)) {
 		struct pq_entry top = merged_iter_pqueue_top(mi->pq);
-		int cmp = 0;
-
-		reftable_record_key(&top.rec, &mi->key);
+		int cmp;
 
-		cmp = strbuf_cmp(&mi->key, &mi->entry_key);
+		cmp = reftable_record_cmp(top.rec, entry.rec);
 		if (cmp > 0)
 			break;
 
 		merged_iter_pqueue_remove(&mi->pq);
 		err = merged_iter_advance_subiter(mi, top.index);
 		if (err < 0)
-			goto done;
-		reftable_record_release(&top.rec);
+			return err;
 	}
 
-	reftable_record_release(rec);
-	*rec = entry.rec;
-
-done:
-	if (err)
-		reftable_record_release(&entry.rec);
-	return err;
+	mi->advance_index = entry.index;
+	SWAP(*rec, *entry.rec);
+	return 0;
 }
 
-static int merged_iter_next(struct merged_iter *mi, struct reftable_record *rec)
+static int merged_iter_next_void(void *p, struct reftable_record *rec)
 {
+	struct merged_iter *mi = p;
 	while (1) {
 		int err = merged_iter_next_entry(mi, rec);
-		if (err == 0 && mi->suppress_deletions &&
-		    reftable_record_is_deletion(rec)) {
+		if (err)
+			return err;
+		if (mi->suppress_deletions && reftable_record_is_deletion(rec))
 			continue;
-		}
-
-		return err;
+		return 0;
 	}
 }
 
-static int merged_iter_next_void(void *p, struct reftable_record *rec)
-{
-	struct merged_iter *mi = p;
-	if (merged_iter_pqueue_is_empty(mi->pq))
-		return 1;
-
-	return merged_iter_next(mi, rec);
-}
-
 static struct reftable_iterator_vtable merged_iter_vtable = {
 	.next = &merged_iter_next_void,
 	.close = &merged_iter_close,
@@ -243,16 +250,15 @@ static int merged_table_seek_record(struct reftable_merged_table *mt,
 		.typ = reftable_record_type(rec),
 		.hash_id = mt->hash_id,
 		.suppress_deletions = mt->suppress_deletions,
-		.key = STRBUF_INIT,
-		.entry_key = STRBUF_INIT,
+		.advance_index = -1,
 	};
 	struct merged_iter *p;
 	int err;
 
-	REFTABLE_CALLOC_ARRAY(merged.stack, mt->stack_len);
+	REFTABLE_CALLOC_ARRAY(merged.subiters, mt->stack_len);
 	for (size_t i = 0; i < mt->stack_len; i++) {
 		err = reftable_table_seek_record(&mt->stack[i],
-						 &merged.stack[merged.stack_len], rec);
+						 &merged.subiters[merged.stack_len].iter, rec);
 		if (err < 0)
 			goto out;
 		if (!err)
diff --git a/reftable/merged.h b/reftable/merged.h
index d5b39dfe7f..a2571dbc99 100644
--- a/reftable/merged.h
+++ b/reftable/merged.h
@@ -9,7 +9,7 @@ https://developers.google.com/open-source/licenses/bsd
 #ifndef MERGED_H
 #define MERGED_H
 
-#include "pq.h"
+#include "system.h"
 
 struct reftable_merged_table {
 	struct reftable_table *stack;
@@ -24,17 +24,6 @@ struct reftable_merged_table {
 	uint64_t max;
 };
 
-struct merged_iter {
-	struct reftable_iterator *stack;
-	uint32_t hash_id;
-	size_t stack_len;
-	uint8_t typ;
-	int suppress_deletions;
-	struct merged_iter_pqueue pq;
-	struct strbuf key;
-	struct strbuf entry_key;
-};
-
 void merged_table_release(struct reftable_merged_table *mt);
 
 #endif
diff --git a/reftable/merged_test.c b/reftable/merged_test.c
index d0f77a3b8f..530fc82d1c 100644
--- a/reftable/merged_test.c
+++ b/reftable/merged_test.c
@@ -289,16 +289,13 @@ merged_table_from_log_records(struct reftable_log_record **logs,
 
 static void test_merged_logs(void)
 {
-	uint8_t hash1[GIT_SHA1_RAWSZ] = { 1 };
-	uint8_t hash2[GIT_SHA1_RAWSZ] = { 2 };
-	uint8_t hash3[GIT_SHA1_RAWSZ] = { 3 };
 	struct reftable_log_record r1[] = {
 		{
 			.refname = "a",
 			.update_index = 2,
 			.value_type = REFTABLE_LOG_UPDATE,
 			.value.update = {
-				.old_hash = hash2,
+				.old_hash = { 2 },
 				/* deletion */
 				.name = "jane doe",
 				.email = "jane@invalid",
@@ -310,8 +307,8 @@ static void test_merged_logs(void)
 			.update_index = 1,
 			.value_type = REFTABLE_LOG_UPDATE,
 			.value.update = {
-				.old_hash = hash1,
-				.new_hash = hash2,
+				.old_hash = { 1 },
+				.new_hash = { 2 },
 				.name = "jane doe",
 				.email = "jane@invalid",
 				.message = "message1",
@@ -324,7 +321,7 @@ static void test_merged_logs(void)
 			.update_index = 3,
 			.value_type = REFTABLE_LOG_UPDATE,
 			.value.update = {
-				.new_hash = hash3,
+				.new_hash = { 3 },
 				.name = "jane doe",
 				.email = "jane@invalid",
 				.message = "message3",
diff --git a/reftable/pq.c b/reftable/pq.c
index 2461daf5ff..7fb45d8c60 100644
--- a/reftable/pq.c
+++ b/reftable/pq.c
@@ -14,33 +14,12 @@ https://developers.google.com/open-source/licenses/bsd
 
 int pq_less(struct pq_entry *a, struct pq_entry *b)
 {
-	struct strbuf ak = STRBUF_INIT;
-	struct strbuf bk = STRBUF_INIT;
-	int cmp = 0;
-	reftable_record_key(&a->rec, &ak);
-	reftable_record_key(&b->rec, &bk);
-
-	cmp = strbuf_cmp(&ak, &bk);
-
-	strbuf_release(&ak);
-	strbuf_release(&bk);
-
+	int cmp = reftable_record_cmp(a->rec, b->rec);
 	if (cmp == 0)
 		return a->index > b->index;
-
 	return cmp < 0;
 }
 
-struct pq_entry merged_iter_pqueue_top(struct merged_iter_pqueue pq)
-{
-	return pq.heap[0];
-}
-
-int merged_iter_pqueue_is_empty(struct merged_iter_pqueue pq)
-{
-	return pq.len == 0;
-}
-
 struct pq_entry merged_iter_pqueue_remove(struct merged_iter_pqueue *pq)
 {
 	int i = 0;
@@ -93,10 +72,6 @@ void merged_iter_pqueue_add(struct merged_iter_pqueue *pq, const struct pq_entry
 
 void merged_iter_pqueue_release(struct merged_iter_pqueue *pq)
 {
-	int i = 0;
-	for (i = 0; i < pq->len; i++) {
-		reftable_record_release(&pq->heap[i].rec);
-	}
 	FREE_AND_NULL(pq->heap);
-	pq->len = pq->cap = 0;
+	memset(pq, 0, sizeof(*pq));
 }
diff --git a/reftable/pq.h b/reftable/pq.h
index e85bac9b52..f796c23179 100644
--- a/reftable/pq.h
+++ b/reftable/pq.h
@@ -12,8 +12,8 @@ https://developers.google.com/open-source/licenses/bsd
 #include "record.h"
 
 struct pq_entry {
-	int index;
-	struct reftable_record rec;
+	size_t index;
+	struct reftable_record *rec;
 };
 
 struct merged_iter_pqueue {
@@ -22,12 +22,20 @@ struct merged_iter_pqueue {
 	size_t cap;
 };
 
-struct pq_entry merged_iter_pqueue_top(struct merged_iter_pqueue pq);
-int merged_iter_pqueue_is_empty(struct merged_iter_pqueue pq);
 void merged_iter_pqueue_check(struct merged_iter_pqueue pq);
 struct pq_entry merged_iter_pqueue_remove(struct merged_iter_pqueue *pq);
 void merged_iter_pqueue_add(struct merged_iter_pqueue *pq, const struct pq_entry *e);
 void merged_iter_pqueue_release(struct merged_iter_pqueue *pq);
 int pq_less(struct pq_entry *a, struct pq_entry *b);
 
+static inline struct pq_entry merged_iter_pqueue_top(struct merged_iter_pqueue pq)
+{
+	return pq.heap[0];
+}
+
+static inline int merged_iter_pqueue_is_empty(struct merged_iter_pqueue pq)
+{
+	return pq.len == 0;
+}
+
 #endif
diff --git a/reftable/pq_test.c b/reftable/pq_test.c
index c202eff848..b7d3c80cc7 100644
--- a/reftable/pq_test.c
+++ b/reftable/pq_test.c
@@ -27,48 +27,43 @@ void merged_iter_pqueue_check(struct merged_iter_pqueue pq)
 
 static void test_pq(void)
 {
-	char *names[54] = { NULL };
-	int N = ARRAY_SIZE(names) - 1;
-
 	struct merged_iter_pqueue pq = { NULL };
+	struct reftable_record recs[54];
+	int N = ARRAY_SIZE(recs) - 1, i;
 	char *last = NULL;
 
-	int i = 0;
 	for (i = 0; i < N; i++) {
-		char name[100];
-		snprintf(name, sizeof(name), "%02d", i);
-		names[i] = xstrdup(name);
+		struct strbuf refname = STRBUF_INIT;
+		strbuf_addf(&refname, "%02d", i);
+
+		reftable_record_init(&recs[i], BLOCK_TYPE_REF);
+		recs[i].u.ref.refname = strbuf_detach(&refname, NULL);
 	}
 
 	i = 1;
 	do {
-		struct pq_entry e = { .rec = { .type = BLOCK_TYPE_REF,
-					       .u.ref = {
-						       .refname = names[i],
-					       } } };
+		struct pq_entry e = {
+			.rec = &recs[i],
+		};
+
 		merged_iter_pqueue_add(&pq, &e);
 		merged_iter_pqueue_check(pq);
+
 		i = (i * 7) % N;
 	} while (i != 1);
 
 	while (!merged_iter_pqueue_is_empty(pq)) {
 		struct pq_entry e = merged_iter_pqueue_remove(&pq);
-		struct reftable_record *rec = &e.rec;
 		merged_iter_pqueue_check(pq);
 
-		EXPECT(reftable_record_type(rec) == BLOCK_TYPE_REF);
-		if (last) {
-			EXPECT(strcmp(last, rec->u.ref.refname) < 0);
-		}
-		/* this is names[i], so don't dealloc. */
-		last = rec->u.ref.refname;
-		rec->u.ref.refname = NULL;
-		reftable_record_release(rec);
-	}
-	for (i = 0; i < N; i++) {
-		reftable_free(names[i]);
+		EXPECT(reftable_record_type(e.rec) == BLOCK_TYPE_REF);
+		if (last)
+			EXPECT(strcmp(last, e.rec->u.ref.refname) < 0);
+		last = e.rec->u.ref.refname;
 	}
 
+	for (i = 0; i < N; i++)
+		reftable_record_release(&recs[i]);
 	merged_iter_pqueue_release(&pq);
 }
 
diff --git a/reftable/reader.c b/reftable/reader.c
index 2663b03938..481dff10d4 100644
--- a/reftable/reader.c
+++ b/reftable/reader.c
@@ -220,6 +220,7 @@ struct table_iter {
 	struct reftable_reader *r;
 	uint8_t typ;
 	uint64_t block_off;
+	struct block_reader br;
 	struct block_iter bi;
 	int is_finished;
 };
@@ -227,16 +228,6 @@ struct table_iter {
 	.bi = BLOCK_ITER_INIT \
 }
 
-static void table_iter_copy_from(struct table_iter *dest,
-				 struct table_iter *src)
-{
-	dest->r = src->r;
-	dest->typ = src->typ;
-	dest->block_off = src->block_off;
-	dest->is_finished = src->is_finished;
-	block_iter_copy_from(&dest->bi, &src->bi);
-}
-
 static int table_iter_next_in_block(struct table_iter *ti,
 				    struct reftable_record *rec)
 {
@@ -250,14 +241,8 @@ static int table_iter_next_in_block(struct table_iter *ti,
 
 static void table_iter_block_done(struct table_iter *ti)
 {
-	if (!ti->bi.br) {
-		return;
-	}
-	reftable_block_done(&ti->bi.br->block);
-	FREE_AND_NULL(ti->bi.br);
-
-	ti->bi.last_key.len = 0;
-	ti->bi.next_off = 0;
+	block_reader_release(&ti->br);
+	block_iter_reset(&ti->bi);
 }
 
 static int32_t extract_block_size(uint8_t *data, uint8_t *typ, uint64_t off,
@@ -321,32 +306,27 @@ done:
 	return err;
 }
 
-static int table_iter_next_block(struct table_iter *dest,
-				 struct table_iter *src)
+static void table_iter_close(struct table_iter *ti)
 {
-	uint64_t next_block_off = src->block_off + src->bi.br->full_block_size;
-	struct block_reader br = { 0 };
-	int err = 0;
+	table_iter_block_done(ti);
+	block_iter_close(&ti->bi);
+}
 
-	dest->r = src->r;
-	dest->typ = src->typ;
-	dest->block_off = next_block_off;
+static int table_iter_next_block(struct table_iter *ti)
+{
+	uint64_t next_block_off = ti->block_off + ti->br.full_block_size;
+	int err;
 
-	err = reader_init_block_reader(src->r, &br, next_block_off, src->typ);
-	if (err > 0) {
-		dest->is_finished = 1;
-		return 1;
-	}
-	if (err != 0)
+	err = reader_init_block_reader(ti->r, &ti->br, next_block_off, ti->typ);
+	if (err > 0)
+		ti->is_finished = 1;
+	if (err)
 		return err;
-	else {
-		struct block_reader *brp =
-			reftable_malloc(sizeof(struct block_reader));
-		*brp = br;
 
-		dest->is_finished = 0;
-		block_reader_start(brp, &dest->bi);
-	}
+	ti->block_off = next_block_off;
+	ti->is_finished = 0;
+	block_iter_seek_start(&ti->bi, &ti->br);
+
 	return 0;
 }
 
@@ -356,27 +336,30 @@ static int table_iter_next(struct table_iter *ti, struct reftable_record *rec)
 		return REFTABLE_API_ERROR;
 
 	while (1) {
-		struct table_iter next = TABLE_ITER_INIT;
-		int err = 0;
-		if (ti->is_finished) {
+		int err;
+
+		if (ti->is_finished)
 			return 1;
-		}
 
+		/*
+		 * Check whether the current block still has more records. If
+		 * so, return it. If the iterator returns positive then the
+		 * current block has been exhausted.
+		 */
 		err = table_iter_next_in_block(ti, rec);
-		if (err <= 0) {
+		if (err <= 0)
 			return err;
-		}
 
-		err = table_iter_next_block(&next, ti);
-		if (err != 0) {
+		/*
+		 * Otherwise, we need to continue to the next block in the
+		 * table and retry. If there are no more blocks then the
+		 * iterator is drained.
+		 */
+		err = table_iter_next_block(ti);
+		if (err) {
 			ti->is_finished = 1;
-		}
-		table_iter_block_done(ti);
-		if (err != 0) {
 			return err;
 		}
-		table_iter_copy_from(ti, &next);
-		block_iter_close(&next.bi);
 	}
 }
 
@@ -385,16 +368,14 @@ static int table_iter_next_void(void *ti, struct reftable_record *rec)
 	return table_iter_next(ti, rec);
 }
 
-static void table_iter_close(void *p)
+static void table_iter_close_void(void *ti)
 {
-	struct table_iter *ti = p;
-	table_iter_block_done(ti);
-	block_iter_close(&ti->bi);
+	table_iter_close(ti);
 }
 
 static struct reftable_iterator_vtable table_iter_vtable = {
 	.next = &table_iter_next_void,
-	.close = &table_iter_close,
+	.close = &table_iter_close_void,
 };
 
 static void iterator_from_table_iter(struct reftable_iterator *it,
@@ -409,19 +390,16 @@ static int reader_table_iter_at(struct reftable_reader *r,
 				struct table_iter *ti, uint64_t off,
 				uint8_t typ)
 {
-	struct block_reader br = { 0 };
-	struct block_reader *brp = NULL;
+	int err;
 
-	int err = reader_init_block_reader(r, &br, off, typ);
+	err = reader_init_block_reader(r, &ti->br, off, typ);
 	if (err != 0)
 		return err;
 
-	brp = reftable_malloc(sizeof(struct block_reader));
-	*brp = br;
 	ti->r = r;
-	ti->typ = block_reader_type(brp);
+	ti->typ = block_reader_type(&ti->br);
 	ti->block_off = off;
-	block_reader_start(brp, &ti->bi);
+	block_iter_seek_start(&ti->bi, &ti->br);
 	return 0;
 }
 
@@ -446,23 +424,52 @@ static int reader_seek_linear(struct table_iter *ti,
 {
 	struct strbuf want_key = STRBUF_INIT;
 	struct strbuf got_key = STRBUF_INIT;
-	struct table_iter next = TABLE_ITER_INIT;
 	struct reftable_record rec;
 	int err = -1;
 
 	reftable_record_init(&rec, reftable_record_type(want));
 	reftable_record_key(want, &want_key);
 
+	/*
+	 * First we need to locate the block that must contain our record. To
+	 * do so we scan through blocks linearly until we find the first block
+	 * whose first key is bigger than our wanted key. Once we have found
+	 * that block we know that the key must be contained in the preceding
+	 * block.
+	 *
+	 * This algorithm is somewhat unfortunate because it means that we
+	 * always have to seek one block too far and then back up. But as we
+	 * can only decode the _first_ key of a block but not its _last_ key we
+	 * have no other way to do this.
+	 */
 	while (1) {
-		err = table_iter_next_block(&next, ti);
+		struct table_iter next = *ti;
+
+		/*
+		 * We must be careful to not modify underlying data of `ti`
+		 * because we may find that `next` does not contain our desired
+		 * block, but that `ti` does. In that case, we would discard
+		 * `next` and continue with `ti`.
+		 *
+		 * This also means that we cannot reuse allocated memory for
+		 * `next` here. While it would be great if we could, it should
+		 * in practice not be too bad given that we should only ever
+		 * end up doing linear seeks with at most three blocks. As soon
+		 * as we have more than three blocks we would have an index, so
+		 * we would not do a linear search there anymore.
+		 */
+		memset(&next.br.block, 0, sizeof(next.br.block));
+		next.br.zstream = NULL;
+		next.br.uncompressed_data = NULL;
+		next.br.uncompressed_cap = 0;
+
+		err = table_iter_next_block(&next);
 		if (err < 0)
 			goto done;
-
-		if (err > 0) {
+		if (err > 0)
 			break;
-		}
 
-		err = block_reader_first_key(next.bi.br, &got_key);
+		err = block_reader_first_key(&next.br, &got_key);
 		if (err < 0)
 			goto done;
 
@@ -472,16 +479,20 @@ static int reader_seek_linear(struct table_iter *ti,
 		}
 
 		table_iter_block_done(ti);
-		table_iter_copy_from(ti, &next);
+		*ti = next;
 	}
 
-	err = block_iter_seek(&ti->bi, &want_key);
+	/*
+	 * We have located the block that must contain our record, so we seek
+	 * the wanted key inside of it. If the block does not contain our key
+	 * we know that the corresponding record does not exist.
+	 */
+	err = block_iter_seek_key(&ti->bi, &ti->br, &want_key);
 	if (err < 0)
 		goto done;
 	err = 0;
 
 done:
-	block_iter_close(&next.bi);
 	reftable_record_release(&rec);
 	strbuf_release(&want_key);
 	strbuf_release(&got_key);
@@ -500,6 +511,7 @@ static int reader_seek_indexed(struct reftable_reader *r,
 		.u.idx = { .last_key = STRBUF_INIT },
 	};
 	struct table_iter index_iter = TABLE_ITER_INIT;
+	struct table_iter empty = TABLE_ITER_INIT;
 	struct table_iter next = TABLE_ITER_INIT;
 	int err = 0;
 
@@ -541,7 +553,6 @@ static int reader_seek_indexed(struct reftable_reader *r,
 		 * not exist.
 		 */
 		err = table_iter_next(&index_iter, &index_result);
-		table_iter_block_done(&index_iter);
 		if (err != 0)
 			goto done;
 
@@ -550,7 +561,7 @@ static int reader_seek_indexed(struct reftable_reader *r,
 		if (err != 0)
 			goto done;
 
-		err = block_iter_seek(&next.bi, &want_index.u.idx.last_key);
+		err = block_iter_seek_key(&next.bi, &next.br, &want_index.u.idx.last_key);
 		if (err < 0)
 			goto done;
 
@@ -564,18 +575,20 @@ static int reader_seek_indexed(struct reftable_reader *r,
 			break;
 		}
 
-		table_iter_copy_from(&index_iter, &next);
+		table_iter_close(&index_iter);
+		index_iter = next;
+		next = empty;
 	}
 
 	if (err == 0) {
-		struct table_iter empty = TABLE_ITER_INIT;
 		struct table_iter *malloced = reftable_calloc(1, sizeof(*malloced));
-		*malloced = empty;
-		table_iter_copy_from(malloced, &next);
+		*malloced = next;
+		next = empty;
 		iterator_from_table_iter(it, malloced);
 	}
+
 done:
-	block_iter_close(&next.bi);
+	table_iter_close(&next);
 	table_iter_close(&index_iter);
 	reftable_record_release(&want_index);
 	reftable_record_release(&index_result);
@@ -589,25 +602,28 @@ static int reader_seek_internal(struct reftable_reader *r,
 	struct reftable_reader_offsets *offs =
 		reader_offsets_for(r, reftable_record_type(rec));
 	uint64_t idx = offs->index_offset;
-	struct table_iter ti = TABLE_ITER_INIT;
-	int err = 0;
+	struct table_iter ti = TABLE_ITER_INIT, *p;
+	int err;
+
 	if (idx > 0)
 		return reader_seek_indexed(r, it, rec);
 
 	err = reader_start(r, &ti, reftable_record_type(rec), 0);
 	if (err < 0)
-		return err;
+		goto out;
+
 	err = reader_seek_linear(&ti, rec);
 	if (err < 0)
-		return err;
-	else {
-		struct table_iter *p =
-			reftable_malloc(sizeof(struct table_iter));
-		*p = ti;
-		iterator_from_table_iter(it, p);
-	}
+		goto out;
 
-	return 0;
+	REFTABLE_ALLOC_ARRAY(p, 1);
+	*p = ti;
+	iterator_from_table_iter(it, p);
+
+out:
+	if (err)
+		table_iter_close(&ti);
+	return err;
 }
 
 static int reader_seek(struct reftable_reader *r, struct reftable_iterator *it,
diff --git a/reftable/readwrite_test.c b/reftable/readwrite_test.c
index 363fe0f998..a6dbd214c5 100644
--- a/reftable/readwrite_test.c
+++ b/reftable/readwrite_test.c
@@ -77,18 +77,15 @@ static void write_table(char ***names, struct strbuf *buf, int N,
 	}
 
 	for (i = 0; i < N; i++) {
-		uint8_t hash[GIT_SHA256_RAWSZ] = { 0 };
 		char name[100];
 		int n;
 
-		set_test_hash(hash, i);
-
 		snprintf(name, sizeof(name), "refs/heads/branch%02d", i);
 
 		log.refname = name;
 		log.update_index = update_index;
 		log.value_type = REFTABLE_LOG_UPDATE;
-		log.value.update.new_hash = hash;
+		set_test_hash(log.value.update.new_hash, i);
 		log.value.update.message = "message";
 
 		n = reftable_writer_add_log(w, &log);
@@ -137,13 +134,10 @@ static void test_log_buffer_size(void)
 	/* This tests buffer extension for log compression. Must use a random
 	   hash, to ensure that the compressed part is larger than the original.
 	*/
-	uint8_t hash1[GIT_SHA1_RAWSZ], hash2[GIT_SHA1_RAWSZ];
 	for (i = 0; i < GIT_SHA1_RAWSZ; i++) {
-		hash1[i] = (uint8_t)(git_rand() % 256);
-		hash2[i] = (uint8_t)(git_rand() % 256);
+		log.value.update.old_hash[i] = (uint8_t)(git_rand() % 256);
+		log.value.update.new_hash[i] = (uint8_t)(git_rand() % 256);
 	}
-	log.value.update.old_hash = hash1;
-	log.value.update.new_hash = hash2;
 	reftable_writer_set_limits(w, update_index, update_index);
 	err = reftable_writer_add_log(w, &log);
 	EXPECT_ERR(err);
@@ -161,25 +155,26 @@ static void test_log_overflow(void)
 		.block_size = ARRAY_SIZE(msg),
 	};
 	int err;
-	struct reftable_log_record
-		log = { .refname = "refs/heads/master",
-			.update_index = 0xa,
-			.value_type = REFTABLE_LOG_UPDATE,
-			.value = { .update = {
-					   .name = "Han-Wen Nienhuys",
-					   .email = "hanwen@google.com",
-					   .tz_offset = 100,
-					   .time = 0x5e430672,
-					   .message = msg,
-				   } } };
+	struct reftable_log_record log = {
+		.refname = "refs/heads/master",
+		.update_index = 0xa,
+		.value_type = REFTABLE_LOG_UPDATE,
+		.value = {
+			.update = {
+				.old_hash = { 1 },
+				.new_hash = { 2 },
+				.name = "Han-Wen Nienhuys",
+				.email = "hanwen@google.com",
+				.tz_offset = 100,
+				.time = 0x5e430672,
+				.message = msg,
+			},
+		},
+	};
 	struct reftable_writer *w =
 		reftable_new_writer(&strbuf_add_void, &noop_flush, &buf, &opts);
 
-	uint8_t hash1[GIT_SHA1_RAWSZ]  = {1}, hash2[GIT_SHA1_RAWSZ] = { 2 };
-
 	memset(msg, 'x', sizeof(msg) - 1);
-	log.value.update.old_hash = hash1;
-	log.value.update.new_hash = hash2;
 	reftable_writer_set_limits(w, update_index, update_index);
 	err = reftable_writer_add_log(w, &log);
 	EXPECT(err == REFTABLE_ENTRY_TOO_BIG_ERROR);
@@ -219,16 +214,13 @@ static void test_log_write_read(void)
 		EXPECT_ERR(err);
 	}
 	for (i = 0; i < N; i++) {
-		uint8_t hash1[GIT_SHA1_RAWSZ], hash2[GIT_SHA1_RAWSZ];
 		struct reftable_log_record log = { NULL };
-		set_test_hash(hash1, i);
-		set_test_hash(hash2, i + 1);
 
 		log.refname = names[i];
 		log.update_index = i;
 		log.value_type = REFTABLE_LOG_UPDATE;
-		log.value.update.old_hash = hash1;
-		log.value.update.new_hash = hash2;
+		set_test_hash(log.value.update.old_hash, i);
+		set_test_hash(log.value.update.new_hash, i + 1);
 
 		err = reftable_writer_add_log(w, &log);
 		EXPECT_ERR(err);
@@ -298,18 +290,15 @@ static void test_log_zlib_corruption(void)
 	struct reftable_writer *w =
 		reftable_new_writer(&strbuf_add_void, &noop_flush, &buf, &opts);
 	const struct reftable_stats *stats = NULL;
-	uint8_t hash1[GIT_SHA1_RAWSZ] = { 1 };
-	uint8_t hash2[GIT_SHA1_RAWSZ] = { 2 };
 	char message[100] = { 0 };
 	int err, i, n;
-
 	struct reftable_log_record log = {
 		.refname = "refname",
 		.value_type = REFTABLE_LOG_UPDATE,
 		.value = {
 			.update = {
-				.new_hash = hash1,
-				.old_hash = hash2,
+				.new_hash = { 1 },
+				.old_hash = { 2 },
 				.name = "My Name",
 				.email = "myname@invalid",
 				.message = message,
@@ -821,13 +810,12 @@ static void test_write_multiple_indices(void)
 	}
 
 	for (i = 0; i < 100; i++) {
-		unsigned char hash[GIT_SHA1_RAWSZ] = {i};
 		struct reftable_log_record log = {
 			.update_index = 1,
 			.value_type = REFTABLE_LOG_UPDATE,
 			.value.update = {
-				.old_hash = hash,
-				.new_hash = hash,
+				.old_hash = { i },
+				.new_hash = { i },
 			},
 		};
 
diff --git a/reftable/record.c b/reftable/record.c
index 26c5e43f9b..5506f3e913 100644
--- a/reftable/record.c
+++ b/reftable/record.c
@@ -159,34 +159,49 @@ int reftable_encode_key(int *restart, struct string_view dest,
 	return start.len - dest.len;
 }
 
-int reftable_decode_key(struct strbuf *key, uint8_t *extra,
-			struct strbuf last_key, struct string_view in)
+int reftable_decode_keylen(struct string_view in,
+			   uint64_t *prefix_len,
+			   uint64_t *suffix_len,
+			   uint8_t *extra)
 {
-	int start_len = in.len;
-	uint64_t prefix_len = 0;
-	uint64_t suffix_len = 0;
-	int n = get_var_int(&prefix_len, &in);
+	size_t start_len = in.len;
+	int n;
+
+	n = get_var_int(prefix_len, &in);
 	if (n < 0)
 		return -1;
 	string_view_consume(&in, n);
 
-	if (prefix_len > last_key.len)
-		return -1;
-
-	n = get_var_int(&suffix_len, &in);
+	n = get_var_int(suffix_len, &in);
 	if (n <= 0)
 		return -1;
 	string_view_consume(&in, n);
 
-	*extra = (uint8_t)(suffix_len & 0x7);
-	suffix_len >>= 3;
+	*extra = (uint8_t)(*suffix_len & 0x7);
+	*suffix_len >>= 3;
 
-	if (in.len < suffix_len)
+	return start_len - in.len;
+}
+
+int reftable_decode_key(struct strbuf *last_key, uint8_t *extra,
+			struct string_view in)
+{
+	int start_len = in.len;
+	uint64_t prefix_len = 0;
+	uint64_t suffix_len = 0;
+	int n;
+
+	n = reftable_decode_keylen(in, &prefix_len, &suffix_len, extra);
+	if (n < 0)
 		return -1;
+	string_view_consume(&in, n);
 
-	strbuf_reset(key);
-	strbuf_add(key, last_key.buf, prefix_len);
-	strbuf_add(key, in.buf, suffix_len);
+	if (in.len < suffix_len ||
+	    prefix_len > last_key->len)
+		return -1;
+
+	strbuf_setlen(last_key, prefix_len);
+	strbuf_add(last_key, in.buf, suffix_len);
 	string_view_consume(&in, suffix_len);
 
 	return start_len - in.len;
@@ -205,14 +220,26 @@ static void reftable_ref_record_copy_from(void *rec, const void *src_rec,
 {
 	struct reftable_ref_record *ref = rec;
 	const struct reftable_ref_record *src = src_rec;
+	char *refname = NULL;
+	size_t refname_cap = 0;
+
 	assert(hash_size > 0);
 
-	/* This is simple and correct, but we could probably reuse the hash
-	 * fields. */
+	SWAP(refname, ref->refname);
+	SWAP(refname_cap, ref->refname_cap);
 	reftable_ref_record_release(ref);
+	SWAP(ref->refname, refname);
+	SWAP(ref->refname_cap, refname_cap);
+
 	if (src->refname) {
-		ref->refname = xstrdup(src->refname);
+		size_t refname_len = strlen(src->refname);
+
+		REFTABLE_ALLOC_GROW(ref->refname, refname_len + 1,
+				    ref->refname_cap);
+		memcpy(ref->refname, src->refname, refname_len);
+		ref->refname[refname_len] = 0;
 	}
+
 	ref->update_index = src->update_index;
 	ref->value_type = src->value_type;
 	switch (src->value_type) {
@@ -363,24 +390,33 @@ static int reftable_ref_record_encode(const void *rec, struct string_view s,
 
 static int reftable_ref_record_decode(void *rec, struct strbuf key,
 				      uint8_t val_type, struct string_view in,
-				      int hash_size)
+				      int hash_size, struct strbuf *scratch)
 {
 	struct reftable_ref_record *r = rec;
 	struct string_view start = in;
 	uint64_t update_index = 0;
-	int n = get_var_int(&update_index, &in);
+	const char *refname = NULL;
+	size_t refname_cap = 0;
+	int n;
+
+	assert(hash_size > 0);
+
+	n = get_var_int(&update_index, &in);
 	if (n < 0)
 		return n;
 	string_view_consume(&in, n);
 
+	SWAP(refname, r->refname);
+	SWAP(refname_cap, r->refname_cap);
 	reftable_ref_record_release(r);
+	SWAP(r->refname, refname);
+	SWAP(r->refname_cap, refname_cap);
 
-	assert(hash_size > 0);
-
-	r->refname = reftable_realloc(r->refname, key.len + 1);
+	REFTABLE_ALLOC_GROW(r->refname, key.len + 1, r->refname_cap);
 	memcpy(r->refname, key.buf, key.len);
-	r->update_index = update_index;
 	r->refname[key.len] = 0;
+
+	r->update_index = update_index;
 	r->value_type = val_type;
 	switch (val_type) {
 	case REFTABLE_REF_VAL1:
@@ -405,13 +441,12 @@ static int reftable_ref_record_decode(void *rec, struct strbuf key,
 		break;
 
 	case REFTABLE_REF_SYMREF: {
-		struct strbuf dest = STRBUF_INIT;
-		int n = decode_string(&dest, in);
+		int n = decode_string(scratch, in);
 		if (n < 0) {
 			return -1;
 		}
 		string_view_consume(&in, n);
-		r->value.symref = dest.buf;
+		r->value.symref = strbuf_detach(scratch, NULL);
 	} break;
 
 	case REFTABLE_REF_DELETION:
@@ -430,7 +465,6 @@ static int reftable_ref_record_is_deletion_void(const void *p)
 		(const struct reftable_ref_record *)p);
 }
 
-
 static int reftable_ref_record_equal_void(const void *a,
 					  const void *b, int hash_size)
 {
@@ -439,6 +473,13 @@ static int reftable_ref_record_equal_void(const void *a,
 	return reftable_ref_record_equal(ra, rb, hash_size);
 }
 
+static int reftable_ref_record_cmp_void(const void *_a, const void *_b)
+{
+	const struct reftable_ref_record *a = _a;
+	const struct reftable_ref_record *b = _b;
+	return strcmp(a->refname, b->refname);
+}
+
 static void reftable_ref_record_print_void(const void *rec,
 					   int hash_size)
 {
@@ -455,6 +496,7 @@ static struct reftable_record_vtable reftable_ref_record_vtable = {
 	.release = &reftable_ref_record_release_void,
 	.is_deletion = &reftable_ref_record_is_deletion_void,
 	.equal = &reftable_ref_record_equal_void,
+	.cmp = &reftable_ref_record_cmp_void,
 	.print = &reftable_ref_record_print_void,
 };
 
@@ -552,7 +594,7 @@ static int reftable_obj_record_encode(const void *rec, struct string_view s,
 
 static int reftable_obj_record_decode(void *rec, struct strbuf key,
 				      uint8_t val_type, struct string_view in,
-				      int hash_size)
+				      int hash_size, struct strbuf *scratch UNUSED)
 {
 	struct string_view start = in;
 	struct reftable_obj_record *r = rec;
@@ -561,6 +603,8 @@ static int reftable_obj_record_decode(void *rec, struct strbuf key,
 	uint64_t last;
 	int j;
 
+	reftable_obj_record_release(r);
+
 	REFTABLE_ALLOC_ARRAY(r->hash_prefix, key.len);
 	memcpy(r->hash_prefix, key.buf, key.len);
 	r->hash_prefix_len = key.len;
@@ -627,6 +671,25 @@ static int reftable_obj_record_equal_void(const void *a, const void *b, int hash
 	return 1;
 }
 
+static int reftable_obj_record_cmp_void(const void *_a, const void *_b)
+{
+	const struct reftable_obj_record *a = _a;
+	const struct reftable_obj_record *b = _b;
+	int cmp;
+
+	cmp = memcmp(a->hash_prefix, b->hash_prefix,
+		     a->hash_prefix_len > b->hash_prefix_len ?
+		     a->hash_prefix_len : b->hash_prefix_len);
+	if (cmp)
+		return cmp;
+
+	/*
+	 * When the prefix is the same then the object record that is longer is
+	 * considered to be bigger.
+	 */
+	return a->hash_prefix_len - b->hash_prefix_len;
+}
+
 static struct reftable_record_vtable reftable_obj_record_vtable = {
 	.key = &reftable_obj_record_key,
 	.type = BLOCK_TYPE_OBJ,
@@ -637,6 +700,7 @@ static struct reftable_record_vtable reftable_obj_record_vtable = {
 	.release = &reftable_obj_record_release,
 	.is_deletion = &not_a_deletion,
 	.equal = &reftable_obj_record_equal_void,
+	.cmp = &reftable_obj_record_cmp_void,
 	.print = &reftable_obj_record_print,
 };
 
@@ -716,16 +780,10 @@ static void reftable_log_record_copy_from(void *rec, const void *src_rec,
 				xstrdup(dst->value.update.message);
 		}
 
-		if (dst->value.update.new_hash) {
-			REFTABLE_ALLOC_ARRAY(dst->value.update.new_hash, hash_size);
-			memcpy(dst->value.update.new_hash,
-			       src->value.update.new_hash, hash_size);
-		}
-		if (dst->value.update.old_hash) {
-			REFTABLE_ALLOC_ARRAY(dst->value.update.old_hash, hash_size);
-			memcpy(dst->value.update.old_hash,
-			       src->value.update.old_hash, hash_size);
-		}
+		memcpy(dst->value.update.new_hash,
+		       src->value.update.new_hash, hash_size);
+		memcpy(dst->value.update.old_hash,
+		       src->value.update.old_hash, hash_size);
 		break;
 	}
 }
@@ -743,8 +801,6 @@ void reftable_log_record_release(struct reftable_log_record *r)
 	case REFTABLE_LOG_DELETION:
 		break;
 	case REFTABLE_LOG_UPDATE:
-		reftable_free(r->value.update.new_hash);
-		reftable_free(r->value.update.old_hash);
 		reftable_free(r->value.update.name);
 		reftable_free(r->value.update.email);
 		reftable_free(r->value.update.message);
@@ -761,33 +817,20 @@ static uint8_t reftable_log_record_val_type(const void *rec)
 	return reftable_log_record_is_deletion(log) ? 0 : 1;
 }
 
-static uint8_t zero[GIT_SHA256_RAWSZ] = { 0 };
-
 static int reftable_log_record_encode(const void *rec, struct string_view s,
 				      int hash_size)
 {
 	const struct reftable_log_record *r = rec;
 	struct string_view start = s;
 	int n = 0;
-	uint8_t *oldh = NULL;
-	uint8_t *newh = NULL;
 	if (reftable_log_record_is_deletion(r))
 		return 0;
 
-	oldh = r->value.update.old_hash;
-	newh = r->value.update.new_hash;
-	if (!oldh) {
-		oldh = zero;
-	}
-	if (!newh) {
-		newh = zero;
-	}
-
 	if (s.len < 2 * hash_size)
 		return -1;
 
-	memcpy(s.buf, oldh, hash_size);
-	memcpy(s.buf + hash_size, newh, hash_size);
+	memcpy(s.buf, r->value.update.old_hash, hash_size);
+	memcpy(s.buf + hash_size, r->value.update.new_hash, hash_size);
 	string_view_consume(&s, 2 * hash_size);
 
 	n = encode_string(r->value.update.name ? r->value.update.name : "", s);
@@ -823,19 +866,18 @@ static int reftable_log_record_encode(const void *rec, struct string_view s,
 
 static int reftable_log_record_decode(void *rec, struct strbuf key,
 				      uint8_t val_type, struct string_view in,
-				      int hash_size)
+				      int hash_size, struct strbuf *scratch)
 {
 	struct string_view start = in;
 	struct reftable_log_record *r = rec;
 	uint64_t max = 0;
 	uint64_t ts = 0;
-	struct strbuf dest = STRBUF_INIT;
 	int n;
 
 	if (key.len <= 9 || key.buf[key.len - 9] != 0)
 		return REFTABLE_FORMAT_ERROR;
 
-	r->refname = reftable_realloc(r->refname, key.len - 8);
+	REFTABLE_ALLOC_GROW(r->refname, key.len - 8, r->refname_cap);
 	memcpy(r->refname, key.buf, key.len - 8);
 	ts = get_be64(key.buf + key.len - 8);
 
@@ -844,9 +886,8 @@ static int reftable_log_record_decode(void *rec, struct strbuf key,
 	if (val_type != r->value_type) {
 		switch (r->value_type) {
 		case REFTABLE_LOG_UPDATE:
-			FREE_AND_NULL(r->value.update.old_hash);
-			FREE_AND_NULL(r->value.update.new_hash);
 			FREE_AND_NULL(r->value.update.message);
+			r->value.update.message_cap = 0;
 			FREE_AND_NULL(r->value.update.email);
 			FREE_AND_NULL(r->value.update.name);
 			break;
@@ -862,36 +903,43 @@ static int reftable_log_record_decode(void *rec, struct strbuf key,
 	if (in.len < 2 * hash_size)
 		return REFTABLE_FORMAT_ERROR;
 
-	r->value.update.old_hash =
-		reftable_realloc(r->value.update.old_hash, hash_size);
-	r->value.update.new_hash =
-		reftable_realloc(r->value.update.new_hash, hash_size);
-
 	memcpy(r->value.update.old_hash, in.buf, hash_size);
 	memcpy(r->value.update.new_hash, in.buf + hash_size, hash_size);
 
 	string_view_consume(&in, 2 * hash_size);
 
-	n = decode_string(&dest, in);
+	n = decode_string(scratch, in);
 	if (n < 0)
 		goto done;
 	string_view_consume(&in, n);
 
-	r->value.update.name =
-		reftable_realloc(r->value.update.name, dest.len + 1);
-	memcpy(r->value.update.name, dest.buf, dest.len);
-	r->value.update.name[dest.len] = 0;
+	/*
+	 * In almost all cases we can expect the reflog name to not change for
+	 * reflog entries as they are tied to the local identity, not to the
+	 * target commits. As an optimization for this common case we can thus
+	 * skip copying over the name in case it's accurate already.
+	 */
+	if (!r->value.update.name ||
+	    strcmp(r->value.update.name, scratch->buf)) {
+		r->value.update.name =
+			reftable_realloc(r->value.update.name, scratch->len + 1);
+		memcpy(r->value.update.name, scratch->buf, scratch->len);
+		r->value.update.name[scratch->len] = 0;
+	}
 
-	strbuf_reset(&dest);
-	n = decode_string(&dest, in);
+	n = decode_string(scratch, in);
 	if (n < 0)
 		goto done;
 	string_view_consume(&in, n);
 
-	r->value.update.email =
-		reftable_realloc(r->value.update.email, dest.len + 1);
-	memcpy(r->value.update.email, dest.buf, dest.len);
-	r->value.update.email[dest.len] = 0;
+	/* Same as above, but for the reflog email. */
+	if (!r->value.update.email ||
+	    strcmp(r->value.update.email, scratch->buf)) {
+		r->value.update.email =
+			reftable_realloc(r->value.update.email, scratch->len + 1);
+		memcpy(r->value.update.email, scratch->buf, scratch->len);
+		r->value.update.email[scratch->len] = 0;
+	}
 
 	ts = 0;
 	n = get_var_int(&ts, &in);
@@ -905,22 +953,19 @@ static int reftable_log_record_decode(void *rec, struct strbuf key,
 	r->value.update.tz_offset = get_be16(in.buf);
 	string_view_consume(&in, 2);
 
-	strbuf_reset(&dest);
-	n = decode_string(&dest, in);
+	n = decode_string(scratch, in);
 	if (n < 0)
 		goto done;
 	string_view_consume(&in, n);
 
-	r->value.update.message =
-		reftable_realloc(r->value.update.message, dest.len + 1);
-	memcpy(r->value.update.message, dest.buf, dest.len);
-	r->value.update.message[dest.len] = 0;
+	REFTABLE_ALLOC_GROW(r->value.update.message, scratch->len + 1,
+			    r->value.update.message_cap);
+	memcpy(r->value.update.message, scratch->buf, scratch->len);
+	r->value.update.message[scratch->len] = 0;
 
-	strbuf_release(&dest);
 	return start.len - in.len;
 
 done:
-	strbuf_release(&dest);
 	return REFTABLE_FORMAT_ERROR;
 }
 
@@ -936,17 +981,6 @@ static int null_streq(char *a, char *b)
 	return 0 == strcmp(a, b);
 }
 
-static int zero_hash_eq(uint8_t *a, uint8_t *b, int sz)
-{
-	if (!a)
-		a = zero;
-
-	if (!b)
-		b = zero;
-
-	return !memcmp(a, b, sz);
-}
-
 static int reftable_log_record_equal_void(const void *a,
 					  const void *b, int hash_size)
 {
@@ -955,6 +989,22 @@ static int reftable_log_record_equal_void(const void *a,
 					 hash_size);
 }
 
+static int reftable_log_record_cmp_void(const void *_a, const void *_b)
+{
+	const struct reftable_log_record *a = _a;
+	const struct reftable_log_record *b = _b;
+	int cmp = strcmp(a->refname, b->refname);
+	if (cmp)
+		return cmp;
+
+	/*
+	 * Note that the comparison here is reversed. This is because the
+	 * update index is reversed when comparing keys. For reference, see how
+	 * we handle this in reftable_log_record_key()`.
+	 */
+	return b->update_index - a->update_index;
+}
+
 int reftable_log_record_equal(const struct reftable_log_record *a,
 			      const struct reftable_log_record *b, int hash_size)
 {
@@ -974,10 +1024,10 @@ int reftable_log_record_equal(const struct reftable_log_record *a,
 				  b->value.update.email) &&
 		       null_streq(a->value.update.message,
 				  b->value.update.message) &&
-		       zero_hash_eq(a->value.update.old_hash,
-				    b->value.update.old_hash, hash_size) &&
-		       zero_hash_eq(a->value.update.new_hash,
-				    b->value.update.new_hash, hash_size);
+		       !memcmp(a->value.update.old_hash,
+			       b->value.update.old_hash, hash_size) &&
+		       !memcmp(a->value.update.new_hash,
+			       b->value.update.new_hash, hash_size);
 	}
 
 	abort();
@@ -1004,6 +1054,7 @@ static struct reftable_record_vtable reftable_log_record_vtable = {
 	.release = &reftable_log_record_release_void,
 	.is_deletion = &reftable_log_record_is_deletion_void,
 	.equal = &reftable_log_record_equal_void,
+	.cmp = &reftable_log_record_cmp_void,
 	.print = &reftable_log_record_print_void,
 };
 
@@ -1054,7 +1105,7 @@ static int reftable_index_record_encode(const void *rec, struct string_view out,
 
 static int reftable_index_record_decode(void *rec, struct strbuf key,
 					uint8_t val_type, struct string_view in,
-					int hash_size)
+					int hash_size, struct strbuf *scratch UNUSED)
 {
 	struct string_view start = in;
 	struct reftable_index_record *r = rec;
@@ -1079,6 +1130,13 @@ static int reftable_index_record_equal(const void *a, const void *b, int hash_si
 	return ia->offset == ib->offset && !strbuf_cmp(&ia->last_key, &ib->last_key);
 }
 
+static int reftable_index_record_cmp(const void *_a, const void *_b)
+{
+	const struct reftable_index_record *a = _a;
+	const struct reftable_index_record *b = _b;
+	return strbuf_cmp(&a->last_key, &b->last_key);
+}
+
 static void reftable_index_record_print(const void *rec, int hash_size)
 {
 	const struct reftable_index_record *idx = rec;
@@ -1096,6 +1154,7 @@ static struct reftable_record_vtable reftable_index_record_vtable = {
 	.release = &reftable_index_record_release,
 	.is_deletion = &not_a_deletion,
 	.equal = &reftable_index_record_equal,
+	.cmp = &reftable_index_record_cmp,
 	.print = &reftable_index_record_print,
 };
 
@@ -1104,11 +1163,6 @@ void reftable_record_key(struct reftable_record *rec, struct strbuf *dest)
 	reftable_record_vtable(rec)->key(reftable_record_data(rec), dest);
 }
 
-uint8_t reftable_record_type(struct reftable_record *rec)
-{
-	return rec->type;
-}
-
 int reftable_record_encode(struct reftable_record *rec, struct string_view dest,
 			   int hash_size)
 {
@@ -1132,10 +1186,12 @@ uint8_t reftable_record_val_type(struct reftable_record *rec)
 }
 
 int reftable_record_decode(struct reftable_record *rec, struct strbuf key,
-			   uint8_t extra, struct string_view src, int hash_size)
+			   uint8_t extra, struct string_view src, int hash_size,
+			   struct strbuf *scratch)
 {
 	return reftable_record_vtable(rec)->decode(reftable_record_data(rec),
-						   key, extra, src, hash_size);
+						   key, extra, src, hash_size,
+						   scratch);
 }
 
 void reftable_record_release(struct reftable_record *rec)
@@ -1149,6 +1205,14 @@ int reftable_record_is_deletion(struct reftable_record *rec)
 		reftable_record_data(rec));
 }
 
+int reftable_record_cmp(struct reftable_record *a, struct reftable_record *b)
+{
+	if (a->type != b->type)
+		BUG("cannot compare reftable records of different type");
+	return reftable_record_vtable(a)->cmp(
+		reftable_record_data(a), reftable_record_data(b));
+}
+
 int reftable_record_equal(struct reftable_record *a, struct reftable_record *b, int hash_size)
 {
 	if (a->type != b->type)
@@ -1222,12 +1286,6 @@ int reftable_log_record_is_deletion(const struct reftable_log_record *log)
 	return (log->value_type == REFTABLE_LOG_DELETION);
 }
 
-void string_view_consume(struct string_view *s, int n)
-{
-	s->buf += n;
-	s->len -= n;
-}
-
 static void *reftable_record_data(struct reftable_record *rec)
 {
 	switch (rec->type) {
diff --git a/reftable/record.h b/reftable/record.h
index e64ed30c80..d778133e6e 100644
--- a/reftable/record.h
+++ b/reftable/record.h
@@ -25,7 +25,11 @@ struct string_view {
 };
 
 /* Advance `s.buf` by `n`, and decrease length. */
-void string_view_consume(struct string_view *s, int n);
+static inline void string_view_consume(struct string_view *s, int n)
+{
+	s->buf += n;
+	s->len -= n;
+}
 
 /* utilities for de/encoding varints */
 
@@ -51,7 +55,8 @@ struct reftable_record_vtable {
 
 	/* decode data from `src` into the record. */
 	int (*decode)(void *rec, struct strbuf key, uint8_t extra,
-		      struct string_view src, int hash_size);
+		      struct string_view src, int hash_size,
+		      struct strbuf *scratch);
 
 	/* deallocate and null the record. */
 	void (*release)(void *rec);
@@ -62,6 +67,12 @@ struct reftable_record_vtable {
 	/* Are two records equal? This assumes they have the same type. Returns 0 for non-equal. */
 	int (*equal)(const void *a, const void *b, int hash_size);
 
+	/*
+	 * Compare keys of two records with each other. The records must have
+	 * the same type.
+	 */
+	int (*cmp)(const void *a, const void *b);
+
 	/* Print on stdout, for debugging. */
 	void (*print)(const void *rec, int hash_size);
 };
@@ -75,9 +86,18 @@ int reftable_encode_key(int *is_restart, struct string_view dest,
 			struct strbuf prev_key, struct strbuf key,
 			uint8_t extra);
 
-/* Decode into `key` and `extra` from `in` */
-int reftable_decode_key(struct strbuf *key, uint8_t *extra,
-			struct strbuf last_key, struct string_view in);
+/* Decode a record's key lengths. */
+int reftable_decode_keylen(struct string_view in,
+			   uint64_t *prefix_len,
+			   uint64_t *suffix_len,
+			   uint8_t *extra);
+
+/*
+ * Decode into `last_key` and `extra` from `in`. `last_key` is expected to
+ * contain the decoded key of the preceding record, if any.
+ */
+int reftable_decode_key(struct strbuf *last_key, uint8_t *extra,
+			struct string_view in);
 
 /* reftable_index_record are used internally to speed up lookups. */
 struct reftable_index_record {
@@ -114,10 +134,10 @@ struct reftable_record {
 void reftable_record_init(struct reftable_record *rec, uint8_t typ);
 
 /* see struct record_vtable */
+int reftable_record_cmp(struct reftable_record *a, struct reftable_record *b);
 int reftable_record_equal(struct reftable_record *a, struct reftable_record *b, int hash_size);
 void reftable_record_print(struct reftable_record *rec, int hash_size);
 void reftable_record_key(struct reftable_record *rec, struct strbuf *dest);
-uint8_t reftable_record_type(struct reftable_record *rec);
 void reftable_record_copy_from(struct reftable_record *rec,
 			       struct reftable_record *src, int hash_size);
 uint8_t reftable_record_val_type(struct reftable_record *rec);
@@ -125,9 +145,14 @@ int reftable_record_encode(struct reftable_record *rec, struct string_view dest,
 			   int hash_size);
 int reftable_record_decode(struct reftable_record *rec, struct strbuf key,
 			   uint8_t extra, struct string_view src,
-			   int hash_size);
+			   int hash_size, struct strbuf *scratch);
 int reftable_record_is_deletion(struct reftable_record *rec);
 
+static inline uint8_t reftable_record_type(struct reftable_record *rec)
+{
+	return rec->type;
+}
+
 /* frees and zeroes out the embedded record */
 void reftable_record_release(struct reftable_record *rec);
 
diff --git a/reftable/record_test.c b/reftable/record_test.c
index a86cff5526..c158ee79ff 100644
--- a/reftable/record_test.c
+++ b/reftable/record_test.c
@@ -99,6 +99,7 @@ static void set_hash(uint8_t *h, int j)
 
 static void test_reftable_ref_record_roundtrip(void)
 {
+	struct strbuf scratch = STRBUF_INIT;
 	int i = 0;
 
 	for (i = REFTABLE_REF_DELETION; i < REFTABLE_NR_REF_VALUETYPES; i++) {
@@ -140,7 +141,7 @@ static void test_reftable_ref_record_roundtrip(void)
 		EXPECT(n > 0);
 
 		/* decode into a non-zero reftable_record to test for leaks. */
-		m = reftable_record_decode(&out, key, i, dest, GIT_SHA1_RAWSZ);
+		m = reftable_record_decode(&out, key, i, dest, GIT_SHA1_RAWSZ, &scratch);
 		EXPECT(n == m);
 
 		EXPECT(reftable_ref_record_equal(&in.u.ref, &out.u.ref,
@@ -150,6 +151,8 @@ static void test_reftable_ref_record_roundtrip(void)
 		strbuf_release(&key);
 		reftable_record_release(&out);
 	}
+
+	strbuf_release(&scratch);
 }
 
 static void test_reftable_log_record_equal(void)
@@ -175,7 +178,6 @@ static void test_reftable_log_record_equal(void)
 static void test_reftable_log_record_roundtrip(void)
 {
 	int i;
-
 	struct reftable_log_record in[] = {
 		{
 			.refname = xstrdup("refs/heads/master"),
@@ -183,8 +185,6 @@ static void test_reftable_log_record_roundtrip(void)
 			.value_type = REFTABLE_LOG_UPDATE,
 			.value = {
 				.update = {
-					.old_hash = reftable_malloc(GIT_SHA1_RAWSZ),
-					.new_hash = reftable_malloc(GIT_SHA1_RAWSZ),
 					.name = xstrdup("han-wen"),
 					.email = xstrdup("hanwen@google.com"),
 					.message = xstrdup("test"),
@@ -202,15 +202,10 @@ static void test_reftable_log_record_roundtrip(void)
 			.refname = xstrdup("branch"),
 			.update_index = 33,
 			.value_type = REFTABLE_LOG_UPDATE,
-			.value = {
-				.update = {
-					.old_hash = reftable_malloc(GIT_SHA1_RAWSZ),
-					.new_hash = reftable_malloc(GIT_SHA1_RAWSZ),
-					/* rest of fields left empty. */
-				},
-			},
 		}
 	};
+	struct strbuf scratch = STRBUF_INIT;
+
 	set_test_hash(in[0].value.update.new_hash, 1);
 	set_test_hash(in[0].value.update.old_hash, 2);
 	set_test_hash(in[2].value.update.new_hash, 3);
@@ -231,8 +226,6 @@ static void test_reftable_log_record_roundtrip(void)
 				.value_type = REFTABLE_LOG_UPDATE,
 				.value = {
 					.update = {
-						.new_hash = reftable_calloc(GIT_SHA1_RAWSZ, 1),
-						.old_hash = reftable_calloc(GIT_SHA1_RAWSZ, 1),
 						.name = xstrdup("old name"),
 						.email = xstrdup("old@email"),
 						.message = xstrdup("old message"),
@@ -252,7 +245,7 @@ static void test_reftable_log_record_roundtrip(void)
 		EXPECT(n >= 0);
 		valtype = reftable_record_val_type(&rec);
 		m = reftable_record_decode(&out, key, valtype, dest,
-					   GIT_SHA1_RAWSZ);
+					   GIT_SHA1_RAWSZ, &scratch);
 		EXPECT(n == m);
 
 		EXPECT(reftable_log_record_equal(&in[i], &out.u.log,
@@ -261,6 +254,8 @@ static void test_reftable_log_record_roundtrip(void)
 		strbuf_release(&key);
 		reftable_record_release(&out);
 	}
+
+	strbuf_release(&scratch);
 }
 
 static void test_u24_roundtrip(void)
@@ -295,7 +290,8 @@ static void test_key_roundtrip(void)
 	EXPECT(!restart);
 	EXPECT(n > 0);
 
-	m = reftable_decode_key(&roundtrip, &rt_extra, last_key, dest);
+	strbuf_addstr(&roundtrip, "refs/heads/master");
+	m = reftable_decode_key(&roundtrip, &rt_extra, dest);
 	EXPECT(n == m);
 	EXPECT(0 == strbuf_cmp(&key, &roundtrip));
 	EXPECT(rt_extra == extra);
@@ -309,23 +305,27 @@ static void test_reftable_obj_record_roundtrip(void)
 {
 	uint8_t testHash1[GIT_SHA1_RAWSZ] = { 1, 2, 3, 4, 0 };
 	uint64_t till9[] = { 1, 2, 3, 4, 500, 600, 700, 800, 9000 };
-	struct reftable_obj_record recs[3] = { {
-						       .hash_prefix = testHash1,
-						       .hash_prefix_len = 5,
-						       .offsets = till9,
-						       .offset_len = 3,
-					       },
-					       {
-						       .hash_prefix = testHash1,
-						       .hash_prefix_len = 5,
-						       .offsets = till9,
-						       .offset_len = 9,
-					       },
-					       {
-						       .hash_prefix = testHash1,
-						       .hash_prefix_len = 5,
-					       } };
+	struct reftable_obj_record recs[3] = {
+		{
+			.hash_prefix = testHash1,
+			.hash_prefix_len = 5,
+			.offsets = till9,
+			.offset_len = 3,
+		},
+		{
+			.hash_prefix = testHash1,
+			.hash_prefix_len = 5,
+			.offsets = till9,
+			.offset_len = 9,
+		},
+		{
+			.hash_prefix = testHash1,
+			.hash_prefix_len = 5,
+		},
+	};
+	struct strbuf scratch = STRBUF_INIT;
 	int i = 0;
+
 	for (i = 0; i < ARRAY_SIZE(recs); i++) {
 		uint8_t buffer[1024] = { 0 };
 		struct string_view dest = {
@@ -349,13 +349,15 @@ static void test_reftable_obj_record_roundtrip(void)
 		EXPECT(n > 0);
 		extra = reftable_record_val_type(&in);
 		m = reftable_record_decode(&out, key, extra, dest,
-					   GIT_SHA1_RAWSZ);
+					   GIT_SHA1_RAWSZ, &scratch);
 		EXPECT(n == m);
 
 		EXPECT(reftable_record_equal(&in, &out, GIT_SHA1_RAWSZ));
 		strbuf_release(&key);
 		reftable_record_release(&out);
 	}
+
+	strbuf_release(&scratch);
 }
 
 static void test_reftable_index_record_roundtrip(void)
@@ -372,6 +374,7 @@ static void test_reftable_index_record_roundtrip(void)
 		.buf = buffer,
 		.len = sizeof(buffer),
 	};
+	struct strbuf scratch = STRBUF_INIT;
 	struct strbuf key = STRBUF_INIT;
 	struct reftable_record out = {
 		.type = BLOCK_TYPE_INDEX,
@@ -389,13 +392,15 @@ static void test_reftable_index_record_roundtrip(void)
 	EXPECT(n > 0);
 
 	extra = reftable_record_val_type(&in);
-	m = reftable_record_decode(&out, key, extra, dest, GIT_SHA1_RAWSZ);
+	m = reftable_record_decode(&out, key, extra, dest, GIT_SHA1_RAWSZ,
+				   &scratch);
 	EXPECT(m == n);
 
 	EXPECT(reftable_record_equal(&in, &out, GIT_SHA1_RAWSZ));
 
 	reftable_record_release(&out);
 	strbuf_release(&key);
+	strbuf_release(&scratch);
 	strbuf_release(&in.u.idx.last_key);
 }
 
diff --git a/reftable/refname.c b/reftable/refname.c
deleted file mode 100644
index 7570e4acf9..0000000000
--- a/reftable/refname.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
-  Copyright 2020 Google LLC
-
-  Use of this source code is governed by a BSD-style
-  license that can be found in the LICENSE file or at
-  https://developers.google.com/open-source/licenses/bsd
-*/
-
-#include "system.h"
-#include "reftable-error.h"
-#include "basics.h"
-#include "refname.h"
-#include "reftable-iterator.h"
-
-struct find_arg {
-	char **names;
-	const char *want;
-};
-
-static int find_name(size_t k, void *arg)
-{
-	struct find_arg *f_arg = arg;
-	return strcmp(f_arg->names[k], f_arg->want) >= 0;
-}
-
-static int modification_has_ref(struct modification *mod, const char *name)
-{
-	struct reftable_ref_record ref = { NULL };
-	int err = 0;
-
-	if (mod->add_len > 0) {
-		struct find_arg arg = {
-			.names = mod->add,
-			.want = name,
-		};
-		int idx = binsearch(mod->add_len, find_name, &arg);
-		if (idx < mod->add_len && !strcmp(mod->add[idx], name)) {
-			return 0;
-		}
-	}
-
-	if (mod->del_len > 0) {
-		struct find_arg arg = {
-			.names = mod->del,
-			.want = name,
-		};
-		int idx = binsearch(mod->del_len, find_name, &arg);
-		if (idx < mod->del_len && !strcmp(mod->del[idx], name)) {
-			return 1;
-		}
-	}
-
-	err = reftable_table_read_ref(&mod->tab, name, &ref);
-	reftable_ref_record_release(&ref);
-	return err;
-}
-
-static void modification_release(struct modification *mod)
-{
-	/* don't delete the strings themselves; they're owned by ref records.
-	 */
-	FREE_AND_NULL(mod->add);
-	FREE_AND_NULL(mod->del);
-	mod->add_len = 0;
-	mod->del_len = 0;
-}
-
-static int modification_has_ref_with_prefix(struct modification *mod,
-					    const char *prefix)
-{
-	struct reftable_iterator it = { NULL };
-	struct reftable_ref_record ref = { NULL };
-	int err = 0;
-
-	if (mod->add_len > 0) {
-		struct find_arg arg = {
-			.names = mod->add,
-			.want = prefix,
-		};
-		int idx = binsearch(mod->add_len, find_name, &arg);
-		if (idx < mod->add_len &&
-		    !strncmp(prefix, mod->add[idx], strlen(prefix)))
-			goto done;
-	}
-	err = reftable_table_seek_ref(&mod->tab, &it, prefix);
-	if (err)
-		goto done;
-
-	while (1) {
-		err = reftable_iterator_next_ref(&it, &ref);
-		if (err)
-			goto done;
-
-		if (mod->del_len > 0) {
-			struct find_arg arg = {
-				.names = mod->del,
-				.want = ref.refname,
-			};
-			int idx = binsearch(mod->del_len, find_name, &arg);
-			if (idx < mod->del_len &&
-			    !strcmp(ref.refname, mod->del[idx])) {
-				continue;
-			}
-		}
-
-		if (strncmp(ref.refname, prefix, strlen(prefix))) {
-			err = 1;
-			goto done;
-		}
-		err = 0;
-		goto done;
-	}
-
-done:
-	reftable_ref_record_release(&ref);
-	reftable_iterator_destroy(&it);
-	return err;
-}
-
-static int validate_refname(const char *name)
-{
-	while (1) {
-		char *next = strchr(name, '/');
-		if (!*name) {
-			return REFTABLE_REFNAME_ERROR;
-		}
-		if (!next) {
-			return 0;
-		}
-		if (next - name == 0 || (next - name == 1 && *name == '.') ||
-		    (next - name == 2 && name[0] == '.' && name[1] == '.'))
-			return REFTABLE_REFNAME_ERROR;
-		name = next + 1;
-	}
-	return 0;
-}
-
-int validate_ref_record_addition(struct reftable_table tab,
-				 struct reftable_ref_record *recs, size_t sz)
-{
-	struct modification mod = {
-		.tab = tab,
-		.add = reftable_calloc(sz, sizeof(*mod.add)),
-		.del = reftable_calloc(sz, sizeof(*mod.del)),
-	};
-	int i = 0;
-	int err = 0;
-	for (; i < sz; i++) {
-		if (reftable_ref_record_is_deletion(&recs[i])) {
-			mod.del[mod.del_len++] = recs[i].refname;
-		} else {
-			mod.add[mod.add_len++] = recs[i].refname;
-		}
-	}
-
-	err = modification_validate(&mod);
-	modification_release(&mod);
-	return err;
-}
-
-static void strbuf_trim_component(struct strbuf *sl)
-{
-	while (sl->len > 0) {
-		int is_slash = (sl->buf[sl->len - 1] == '/');
-		strbuf_setlen(sl, sl->len - 1);
-		if (is_slash)
-			break;
-	}
-}
-
-int modification_validate(struct modification *mod)
-{
-	struct strbuf slashed = STRBUF_INIT;
-	int err = 0;
-	int i = 0;
-	for (; i < mod->add_len; i++) {
-		err = validate_refname(mod->add[i]);
-		if (err)
-			goto done;
-		strbuf_reset(&slashed);
-		strbuf_addstr(&slashed, mod->add[i]);
-		strbuf_addstr(&slashed, "/");
-
-		err = modification_has_ref_with_prefix(mod, slashed.buf);
-		if (err == 0) {
-			err = REFTABLE_NAME_CONFLICT;
-			goto done;
-		}
-		if (err < 0)
-			goto done;
-
-		strbuf_reset(&slashed);
-		strbuf_addstr(&slashed, mod->add[i]);
-		while (slashed.len) {
-			strbuf_trim_component(&slashed);
-			err = modification_has_ref(mod, slashed.buf);
-			if (err == 0) {
-				err = REFTABLE_NAME_CONFLICT;
-				goto done;
-			}
-			if (err < 0)
-				goto done;
-		}
-	}
-	err = 0;
-done:
-	strbuf_release(&slashed);
-	return err;
-}
diff --git a/reftable/refname.h b/reftable/refname.h
deleted file mode 100644
index a24b40fcb4..0000000000
--- a/reftable/refname.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
-  Copyright 2020 Google LLC
-
-  Use of this source code is governed by a BSD-style
-  license that can be found in the LICENSE file or at
-  https://developers.google.com/open-source/licenses/bsd
-*/
-#ifndef REFNAME_H
-#define REFNAME_H
-
-#include "reftable-record.h"
-#include "reftable-generic.h"
-
-struct modification {
-	struct reftable_table tab;
-
-	char **add;
-	size_t add_len;
-
-	char **del;
-	size_t del_len;
-};
-
-int validate_ref_record_addition(struct reftable_table tab,
-				 struct reftable_ref_record *recs, size_t sz);
-
-int modification_validate(struct modification *mod);
-
-#endif
diff --git a/reftable/refname_test.c b/reftable/refname_test.c
deleted file mode 100644
index b9cc62554e..0000000000
--- a/reftable/refname_test.c
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
-Copyright 2020 Google LLC
-
-Use of this source code is governed by a BSD-style
-license that can be found in the LICENSE file or at
-https://developers.google.com/open-source/licenses/bsd
-*/
-
-#include "basics.h"
-#include "block.h"
-#include "blocksource.h"
-#include "reader.h"
-#include "record.h"
-#include "refname.h"
-#include "reftable-error.h"
-#include "reftable-writer.h"
-#include "system.h"
-
-#include "test_framework.h"
-#include "reftable-tests.h"
-
-struct testcase {
-	char *add;
-	char *del;
-	int error_code;
-};
-
-static void test_conflict(void)
-{
-	struct reftable_write_options opts = { 0 };
-	struct strbuf buf = STRBUF_INIT;
-	struct reftable_writer *w =
-		reftable_new_writer(&strbuf_add_void, &noop_flush, &buf, &opts);
-	struct reftable_ref_record rec = {
-		.refname = "a/b",
-		.value_type = REFTABLE_REF_SYMREF,
-		.value.symref = "destination", /* make sure it's not a symref.
-						*/
-		.update_index = 1,
-	};
-	int err;
-	int i;
-	struct reftable_block_source source = { NULL };
-	struct reftable_reader *rd = NULL;
-	struct reftable_table tab = { NULL };
-	struct testcase cases[] = {
-		{ "a/b/c", NULL, REFTABLE_NAME_CONFLICT },
-		{ "b", NULL, 0 },
-		{ "a", NULL, REFTABLE_NAME_CONFLICT },
-		{ "a", "a/b", 0 },
-
-		{ "p/", NULL, REFTABLE_REFNAME_ERROR },
-		{ "p//q", NULL, REFTABLE_REFNAME_ERROR },
-		{ "p/./q", NULL, REFTABLE_REFNAME_ERROR },
-		{ "p/../q", NULL, REFTABLE_REFNAME_ERROR },
-
-		{ "a/b/c", "a/b", 0 },
-		{ NULL, "a//b", 0 },
-	};
-	reftable_writer_set_limits(w, 1, 1);
-
-	err = reftable_writer_add_ref(w, &rec);
-	EXPECT_ERR(err);
-
-	err = reftable_writer_close(w);
-	EXPECT_ERR(err);
-	reftable_writer_free(w);
-
-	block_source_from_strbuf(&source, &buf);
-	err = reftable_new_reader(&rd, &source, "filename");
-	EXPECT_ERR(err);
-
-	reftable_table_from_reader(&tab, rd);
-
-	for (i = 0; i < ARRAY_SIZE(cases); i++) {
-		struct modification mod = {
-			.tab = tab,
-		};
-
-		if (cases[i].add) {
-			mod.add = &cases[i].add;
-			mod.add_len = 1;
-		}
-		if (cases[i].del) {
-			mod.del = &cases[i].del;
-			mod.del_len = 1;
-		}
-
-		err = modification_validate(&mod);
-		EXPECT(err == cases[i].error_code);
-	}
-
-	reftable_reader_free(rd);
-	strbuf_release(&buf);
-}
-
-int refname_test_main(int argc, const char *argv[])
-{
-	RUN_TEST(test_conflict);
-	return 0;
-}
diff --git a/reftable/reftable-error.h b/reftable/reftable-error.h
index 4c457aaaf8..6368cd9ed9 100644
--- a/reftable/reftable-error.h
+++ b/reftable/reftable-error.h
@@ -25,7 +25,7 @@ enum reftable_error {
 	 */
 	REFTABLE_NOT_EXIST_ERROR = -4,
 
-	/* Trying to write out-of-date data. */
+	/* Trying to access locked data. */
 	REFTABLE_LOCK_ERROR = -5,
 
 	/* Misuse of the API:
@@ -48,15 +48,15 @@ enum reftable_error {
 	/* Wrote a table without blocks. */
 	REFTABLE_EMPTY_TABLE_ERROR = -8,
 
-	/* Dir/file conflict. */
-	REFTABLE_NAME_CONFLICT = -9,
-
 	/* Invalid ref name. */
 	REFTABLE_REFNAME_ERROR = -10,
 
 	/* Entry does not fit. This can happen when writing outsize reflog
 	   messages. */
 	REFTABLE_ENTRY_TOO_BIG_ERROR = -11,
+
+	/* Trying to write out-of-date data. */
+	REFTABLE_OUTDATED_ERROR = -12,
 };
 
 /* convert the numeric error code to a string. The string should not be
diff --git a/reftable/reftable-record.h b/reftable/reftable-record.h
index bb6e99acd3..2a2943cd13 100644
--- a/reftable/reftable-record.h
+++ b/reftable/reftable-record.h
@@ -22,6 +22,7 @@ https://developers.google.com/open-source/licenses/bsd
 /* reftable_ref_record holds a ref database entry target_value */
 struct reftable_ref_record {
 	char *refname; /* Name of the ref, malloced. */
+	size_t refname_cap;
 	uint64_t update_index; /* Logical timestamp at which this value is
 				* written */
 
@@ -73,6 +74,7 @@ int reftable_ref_record_equal(const struct reftable_ref_record *a,
 /* reftable_log_record holds a reflog entry */
 struct reftable_log_record {
 	char *refname;
+	size_t refname_cap;
 	uint64_t update_index; /* logical timestamp of a transactional update.
 				*/
 
@@ -87,13 +89,14 @@ struct reftable_log_record {
 
 	union {
 		struct {
-			uint8_t *new_hash;
-			uint8_t *old_hash;
+			unsigned char new_hash[GIT_MAX_RAWSZ];
+			unsigned char old_hash[GIT_MAX_RAWSZ];
 			char *name;
 			char *email;
 			uint64_t time;
 			int16_t tz_offset;
 			char *message;
+			size_t message_cap;
 		} update;
 	} value;
 };
diff --git a/reftable/reftable-tests.h b/reftable/reftable-tests.h
index 0019cbcfa4..114cc3d053 100644
--- a/reftable/reftable-tests.h
+++ b/reftable/reftable-tests.h
@@ -14,7 +14,6 @@ int block_test_main(int argc, const char **argv);
 int merged_test_main(int argc, const char **argv);
 int pq_test_main(int argc, const char **argv);
 int record_test_main(int argc, const char **argv);
-int refname_test_main(int argc, const char **argv);
 int readwrite_test_main(int argc, const char **argv);
 int stack_test_main(int argc, const char **argv);
 int tree_test_main(int argc, const char **argv);
diff --git a/reftable/reftable-writer.h b/reftable/reftable-writer.h
index 7c7cae5f99..b601a69a40 100644
--- a/reftable/reftable-writer.h
+++ b/reftable/reftable-writer.h
@@ -38,14 +38,13 @@ struct reftable_write_options {
 	/* Default mode for creating files. If unset, use 0666 (+umask) */
 	unsigned int default_permissions;
 
-	/* boolean: do not check ref names for validity or dir/file conflicts.
-	 */
-	unsigned skip_name_check : 1;
-
 	/* boolean: copy log messages exactly. If unset, check that the message
 	 *   is a single line, and add '\n' if missing.
 	 */
 	unsigned exact_log_message : 1;
+
+	/* boolean: Prevent auto-compaction of tables. */
+	unsigned disable_auto_compact : 1;
 };
 
 /* reftable_block_stats holds statistics for a single block type */
diff --git a/reftable/stack.c b/reftable/stack.c
index b64e55648a..a59ebe038d 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -12,8 +12,8 @@ https://developers.google.com/open-source/licenses/bsd
 #include "system.h"
 #include "merged.h"
 #include "reader.h"
-#include "refname.h"
 #include "reftable-error.h"
+#include "reftable-generic.h"
 #include "reftable-record.h"
 #include "reftable-merged.h"
 #include "writer.h"
@@ -27,8 +27,6 @@ static int stack_write_compact(struct reftable_stack *st,
 			       struct reftable_writer *wr,
 			       size_t first, size_t last,
 			       struct reftable_log_expiry_config *config);
-static int stack_check_addition(struct reftable_stack *st,
-				const char *new_tab_name);
 static void reftable_addition_close(struct reftable_addition *add);
 static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 					     int reuse_open);
@@ -529,9 +527,9 @@ int reftable_stack_add(struct reftable_stack *st,
 {
 	int err = stack_try_add(st, write, arg);
 	if (err < 0) {
-		if (err == REFTABLE_LOCK_ERROR) {
+		if (err == REFTABLE_OUTDATED_ERROR) {
 			/* Ignore error return, we want to propagate
-			   REFTABLE_LOCK_ERROR.
+			   REFTABLE_OUTDATED_ERROR.
 			*/
 			reftable_stack_reload(st);
 		}
@@ -590,9 +588,8 @@ static int reftable_stack_init_addition(struct reftable_addition *add,
 	err = stack_uptodate(st);
 	if (err < 0)
 		goto done;
-
-	if (err > 1) {
-		err = REFTABLE_LOCK_ERROR;
+	if (err > 0) {
+		err = REFTABLE_OUTDATED_ERROR;
 		goto done;
 	}
 
@@ -681,8 +678,19 @@ int reftable_addition_commit(struct reftable_addition *add)
 	if (err)
 		goto done;
 
-	if (!add->stack->disable_auto_compact)
+	if (!add->stack->config.disable_auto_compact) {
+		/*
+		 * Auto-compact the stack to keep the number of tables in
+		 * control. It is possible that a concurrent writer is already
+		 * trying to compact parts of the stack, which would lead to a
+		 * `REFTABLE_LOCK_ERROR` because parts of the stack are locked
+		 * already. This is a benign error though, so we ignore it.
+		 */
 		err = reftable_stack_auto_compact(add->stack);
+		if (err < 0 && err != REFTABLE_LOCK_ERROR)
+			goto done;
+		err = 0;
+	}
 
 done:
 	reftable_addition_close(add);
@@ -713,10 +721,6 @@ static int stack_try_add(struct reftable_stack *st,
 	int err = reftable_stack_init_addition(&add, st);
 	if (err < 0)
 		goto done;
-	if (err > 0) {
-		err = REFTABLE_LOCK_ERROR;
-		goto done;
-	}
 
 	err = reftable_addition_add(&add, write_table, arg);
 	if (err < 0)
@@ -737,8 +741,9 @@ int reftable_addition_add(struct reftable_addition *add,
 	struct strbuf tab_file_name = STRBUF_INIT;
 	struct strbuf next_name = STRBUF_INIT;
 	struct reftable_writer *wr = NULL;
+	struct tempfile *tab_file = NULL;
 	int err = 0;
-	int tab_fd = 0;
+	int tab_fd;
 
 	strbuf_reset(&next_name);
 	format_name(&next_name, add->next_update_index, add->next_update_index);
@@ -746,17 +751,20 @@ int reftable_addition_add(struct reftable_addition *add,
 	stack_filename(&temp_tab_file_name, add->stack, next_name.buf);
 	strbuf_addstr(&temp_tab_file_name, ".temp.XXXXXX");
 
-	tab_fd = mkstemp(temp_tab_file_name.buf);
-	if (tab_fd < 0) {
+	tab_file = mks_tempfile(temp_tab_file_name.buf);
+	if (!tab_file) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
 	}
 	if (add->stack->config.default_permissions) {
-		if (chmod(temp_tab_file_name.buf, add->stack->config.default_permissions)) {
+		if (chmod(get_tempfile_path(tab_file),
+			  add->stack->config.default_permissions)) {
 			err = REFTABLE_IO_ERROR;
 			goto done;
 		}
 	}
+	tab_fd = get_tempfile_fd(tab_file);
+
 	wr = reftable_new_writer(reftable_fd_write, reftable_fd_flush, &tab_fd,
 				 &add->stack->config);
 	err = write_table(wr, arg);
@@ -771,17 +779,12 @@ int reftable_addition_add(struct reftable_addition *add,
 	if (err < 0)
 		goto done;
 
-	err = close(tab_fd);
-	tab_fd = 0;
+	err = close_tempfile_gently(tab_file);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
 	}
 
-	err = stack_check_addition(add->stack, temp_tab_file_name.buf);
-	if (err < 0)
-		goto done;
-
 	if (wr->min_update_index < add->next_update_index) {
 		err = REFTABLE_API_ERROR;
 		goto done;
@@ -789,14 +792,13 @@ int reftable_addition_add(struct reftable_addition *add,
 
 	format_name(&next_name, wr->min_update_index, wr->max_update_index);
 	strbuf_addstr(&next_name, ".ref");
-
 	stack_filename(&tab_file_name, add->stack, next_name.buf);
 
 	/*
 	  On windows, this relies on rand() picking a unique destination name.
 	  Maybe we should do retry loop as well?
 	 */
-	err = rename(temp_tab_file_name.buf, tab_file_name.buf);
+	err = rename_tempfile(&tab_file, tab_file_name.buf);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
@@ -806,14 +808,7 @@ int reftable_addition_add(struct reftable_addition *add,
 			    add->new_tables_cap);
 	add->new_tables[add->new_tables_len++] = strbuf_detach(&next_name, NULL);
 done:
-	if (tab_fd > 0) {
-		close(tab_fd);
-		tab_fd = 0;
-	}
-	if (temp_tab_file_name.len > 0) {
-		unlink(temp_tab_file_name.buf);
-	}
-
+	delete_tempfile(&tab_file);
 	strbuf_release(&temp_tab_file_name);
 	strbuf_release(&tab_file_name);
 	strbuf_release(&next_name);
@@ -832,51 +827,56 @@ uint64_t reftable_stack_next_update_index(struct reftable_stack *st)
 
 static int stack_compact_locked(struct reftable_stack *st,
 				size_t first, size_t last,
-				struct strbuf *temp_tab,
-				struct reftable_log_expiry_config *config)
+				struct reftable_log_expiry_config *config,
+				struct tempfile **tab_file_out)
 {
 	struct strbuf next_name = STRBUF_INIT;
-	int tab_fd = -1;
+	struct strbuf tab_file_path = STRBUF_INIT;
 	struct reftable_writer *wr = NULL;
-	int err = 0;
+	struct tempfile *tab_file;
+	int tab_fd, err = 0;
 
 	format_name(&next_name,
 		    reftable_reader_min_update_index(st->readers[first]),
 		    reftable_reader_max_update_index(st->readers[last]));
+	stack_filename(&tab_file_path, st, next_name.buf);
+	strbuf_addstr(&tab_file_path, ".temp.XXXXXX");
 
-	stack_filename(temp_tab, st, next_name.buf);
-	strbuf_addstr(temp_tab, ".temp.XXXXXX");
+	tab_file = mks_tempfile(tab_file_path.buf);
+	if (!tab_file) {
+		err = REFTABLE_IO_ERROR;
+		goto done;
+	}
+	tab_fd = get_tempfile_fd(tab_file);
 
-	tab_fd = mkstemp(temp_tab->buf);
 	if (st->config.default_permissions &&
-	    chmod(temp_tab->buf, st->config.default_permissions) < 0) {
+	    chmod(get_tempfile_path(tab_file), st->config.default_permissions) < 0) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
 	}
 
-	wr = reftable_new_writer(reftable_fd_write, reftable_fd_flush, &tab_fd, &st->config);
-
+	wr = reftable_new_writer(reftable_fd_write, reftable_fd_flush,
+				 &tab_fd, &st->config);
 	err = stack_write_compact(st, wr, first, last, config);
 	if (err < 0)
 		goto done;
+
 	err = reftable_writer_close(wr);
 	if (err < 0)
 		goto done;
 
-	err = close(tab_fd);
-	tab_fd = 0;
+	err = close_tempfile_gently(tab_file);
+	if (err < 0)
+		goto done;
+
+	*tab_file_out = tab_file;
+	tab_file = NULL;
 
 done:
+	delete_tempfile(&tab_file);
 	reftable_writer_free(wr);
-	if (tab_fd > 0) {
-		close(tab_fd);
-		tab_fd = 0;
-	}
-	if (err != 0 && temp_tab->len > 0) {
-		unlink(temp_tab->buf);
-		strbuf_release(temp_tab);
-	}
 	strbuf_release(&next_name);
+	strbuf_release(&tab_file_path);
 	return err;
 }
 
@@ -978,217 +978,213 @@ done:
 	return err;
 }
 
-/* <  0: error. 0 == OK, > 0 attempt failed; could retry. */
+/*
+ * Compact all tables in the range `[first, last)` into a single new table.
+ *
+ * This function returns `0` on success or a code `< 0` on failure. When the
+ * stack or any of the tables in the specified range are already locked then
+ * this function returns `REFTABLE_LOCK_ERROR`. This is a benign error that
+ * callers can either ignore, or they may choose to retry compaction after some
+ * amount of time.
+ */
 static int stack_compact_range(struct reftable_stack *st,
 			       size_t first, size_t last,
 			       struct reftable_log_expiry_config *expiry)
 {
-	char **delete_on_success = NULL, **subtable_locks = NULL, **listp = NULL;
-	struct strbuf temp_tab_file_name = STRBUF_INIT;
+	struct strbuf tables_list_buf = STRBUF_INIT;
 	struct strbuf new_table_name = STRBUF_INIT;
-	struct strbuf lock_file_name = STRBUF_INIT;
-	struct strbuf ref_list_contents = STRBUF_INIT;
 	struct strbuf new_table_path = STRBUF_INIT;
-	size_t i, j, compact_count;
-	int err = 0;
-	int have_lock = 0;
-	int lock_file_fd = -1;
-	int is_empty_table = 0;
+	struct strbuf table_name = STRBUF_INIT;
+	struct lock_file tables_list_lock = LOCK_INIT;
+	struct lock_file *table_locks = NULL;
+	struct tempfile *new_table = NULL;
+	int is_empty_table = 0, err = 0;
+	size_t i;
 
 	if (first > last || (!expiry && first == last)) {
 		err = 0;
 		goto done;
 	}
 
-	compact_count = last - first + 1;
-	REFTABLE_CALLOC_ARRAY(delete_on_success, compact_count + 1);
-	REFTABLE_CALLOC_ARRAY(subtable_locks, compact_count + 1);
-
 	st->stats.attempts++;
 
-	strbuf_reset(&lock_file_name);
-	strbuf_addstr(&lock_file_name, st->list_file);
-	strbuf_addstr(&lock_file_name, ".lock");
-
-	lock_file_fd =
-		open(lock_file_name.buf, O_EXCL | O_CREAT | O_WRONLY, 0666);
-	if (lock_file_fd < 0) {
-		if (errno == EEXIST) {
-			err = 1;
-		} else {
+	/*
+	 * Hold the lock so that we can read "tables.list" and lock all tables
+	 * which are part of the user-specified range.
+	 */
+	err = hold_lock_file_for_update(&tables_list_lock, st->list_file,
+					LOCK_NO_DEREF);
+	if (err < 0) {
+		if (errno == EEXIST)
+			err = REFTABLE_LOCK_ERROR;
+		else
 			err = REFTABLE_IO_ERROR;
-		}
 		goto done;
 	}
-	/* Don't want to write to the lock for now.  */
-	close(lock_file_fd);
-	lock_file_fd = -1;
 
-	have_lock = 1;
 	err = stack_uptodate(st);
-	if (err != 0)
+	if (err)
 		goto done;
 
-	for (i = first, j = 0; i <= last; i++) {
-		struct strbuf subtab_file_name = STRBUF_INIT;
-		struct strbuf subtab_lock = STRBUF_INIT;
-		int sublock_file_fd = -1;
-
-		stack_filename(&subtab_file_name, st,
-			       reader_name(st->readers[i]));
-
-		strbuf_reset(&subtab_lock);
-		strbuf_addbuf(&subtab_lock, &subtab_file_name);
-		strbuf_addstr(&subtab_lock, ".lock");
-
-		sublock_file_fd = open(subtab_lock.buf,
-				       O_EXCL | O_CREAT | O_WRONLY, 0666);
-		if (sublock_file_fd >= 0) {
-			close(sublock_file_fd);
-		} else if (sublock_file_fd < 0) {
-			if (errno == EEXIST) {
-				err = 1;
-			} else {
+	/*
+	 * Lock all tables in the user-provided range. This is the slice of our
+	 * stack which we'll compact.
+	 */
+	REFTABLE_CALLOC_ARRAY(table_locks, last - first + 1);
+	for (i = first; i <= last; i++) {
+		stack_filename(&table_name, st, reader_name(st->readers[i]));
+
+		err = hold_lock_file_for_update(&table_locks[i - first],
+						table_name.buf, LOCK_NO_DEREF);
+		if (err < 0) {
+			if (errno == EEXIST)
+				err = REFTABLE_LOCK_ERROR;
+			else
 				err = REFTABLE_IO_ERROR;
-			}
+			goto done;
 		}
 
-		subtable_locks[j] = subtab_lock.buf;
-		delete_on_success[j] = subtab_file_name.buf;
-		j++;
-
-		if (err != 0)
+		/*
+		 * We need to close the lockfiles as we might otherwise easily
+		 * run into file descriptor exhaustion when we compress a lot
+		 * of tables.
+		 */
+		err = close_lock_file_gently(&table_locks[i - first]);
+		if (err < 0) {
+			err = REFTABLE_IO_ERROR;
 			goto done;
+		}
 	}
 
-	err = unlink(lock_file_name.buf);
-	if (err < 0)
+	/*
+	 * We have locked all tables in our range and can thus release the
+	 * "tables.list" lock while compacting the locked tables. This allows
+	 * concurrent updates to the stack to proceed.
+	 */
+	err = rollback_lock_file(&tables_list_lock);
+	if (err < 0) {
+		err = REFTABLE_IO_ERROR;
 		goto done;
-	have_lock = 0;
-
-	err = stack_compact_locked(st, first, last, &temp_tab_file_name,
-				   expiry);
-	/* Compaction + tombstones can create an empty table out of non-empty
-	 * tables. */
-	is_empty_table = (err == REFTABLE_EMPTY_TABLE_ERROR);
-	if (is_empty_table) {
-		err = 0;
 	}
-	if (err < 0)
-		goto done;
 
-	lock_file_fd =
-		open(lock_file_name.buf, O_EXCL | O_CREAT | O_WRONLY, 0666);
-	if (lock_file_fd < 0) {
-		if (errno == EEXIST) {
-			err = 1;
-		} else {
+	/*
+	 * Compact the now-locked tables into a new table. Note that compacting
+	 * these tables may end up with an empty new table in case tombstones
+	 * end up cancelling out all refs in that range.
+	 */
+	err = stack_compact_locked(st, first, last, expiry, &new_table);
+	if (err < 0) {
+		if (err != REFTABLE_EMPTY_TABLE_ERROR)
+			goto done;
+		is_empty_table = 1;
+	}
+
+	/*
+	 * Now that we have written the new, compacted table we need to re-lock
+	 * "tables.list". We'll then replace the compacted range of tables with
+	 * the new table.
+	 */
+	err = hold_lock_file_for_update(&tables_list_lock, st->list_file,
+					LOCK_NO_DEREF);
+	if (err < 0) {
+		if (errno == EEXIST)
+			err = REFTABLE_LOCK_ERROR;
+		else
 			err = REFTABLE_IO_ERROR;
-		}
 		goto done;
 	}
-	have_lock = 1;
+
 	if (st->config.default_permissions) {
-		if (chmod(lock_file_name.buf, st->config.default_permissions) < 0) {
+		if (chmod(get_lock_file_path(&tables_list_lock),
+			  st->config.default_permissions) < 0) {
 			err = REFTABLE_IO_ERROR;
 			goto done;
 		}
 	}
 
-	format_name(&new_table_name, st->readers[first]->min_update_index,
-		    st->readers[last]->max_update_index);
-	strbuf_addstr(&new_table_name, ".ref");
-
-	stack_filename(&new_table_path, st, new_table_name.buf);
-
+	/*
+	 * If the resulting compacted table is not empty, then we need to move
+	 * it into place now.
+	 */
 	if (!is_empty_table) {
-		/* retry? */
-		err = rename(temp_tab_file_name.buf, new_table_path.buf);
+		format_name(&new_table_name, st->readers[first]->min_update_index,
+			    st->readers[last]->max_update_index);
+		strbuf_addstr(&new_table_name, ".ref");
+		stack_filename(&new_table_path, st, new_table_name.buf);
+
+		err = rename_tempfile(&new_table, new_table_path.buf);
 		if (err < 0) {
 			err = REFTABLE_IO_ERROR;
 			goto done;
 		}
 	}
 
-	for (i = 0; i < first; i++) {
-		strbuf_addstr(&ref_list_contents, st->readers[i]->name);
-		strbuf_addstr(&ref_list_contents, "\n");
-	}
-	if (!is_empty_table) {
-		strbuf_addbuf(&ref_list_contents, &new_table_name);
-		strbuf_addstr(&ref_list_contents, "\n");
-	}
-	for (i = last + 1; i < st->merged->stack_len; i++) {
-		strbuf_addstr(&ref_list_contents, st->readers[i]->name);
-		strbuf_addstr(&ref_list_contents, "\n");
-	}
-
-	err = write_in_full(lock_file_fd, ref_list_contents.buf, ref_list_contents.len);
-	if (err < 0) {
-		err = REFTABLE_IO_ERROR;
-		unlink(new_table_path.buf);
-		goto done;
-	}
-
-	err = fsync_component(FSYNC_COMPONENT_REFERENCE, lock_file_fd);
+	/*
+	 * Write the new "tables.list" contents with the compacted table we
+	 * have just written. In case the compacted table became empty we
+	 * simply skip writing it.
+	 */
+	for (i = 0; i < first; i++)
+		strbuf_addf(&tables_list_buf, "%s\n", st->readers[i]->name);
+	if (!is_empty_table)
+		strbuf_addf(&tables_list_buf, "%s\n", new_table_name.buf);
+	for (i = last + 1; i < st->merged->stack_len; i++)
+		strbuf_addf(&tables_list_buf, "%s\n", st->readers[i]->name);
+
+	err = write_in_full(get_lock_file_fd(&tables_list_lock),
+			    tables_list_buf.buf, tables_list_buf.len);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		unlink(new_table_path.buf);
 		goto done;
 	}
 
-	err = close(lock_file_fd);
-	lock_file_fd = -1;
+	err = fsync_component(FSYNC_COMPONENT_REFERENCE, get_lock_file_fd(&tables_list_lock));
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		unlink(new_table_path.buf);
 		goto done;
 	}
 
-	err = rename(lock_file_name.buf, st->list_file);
+	err = commit_lock_file(&tables_list_lock);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		unlink(new_table_path.buf);
 		goto done;
 	}
-	have_lock = 0;
 
-	/* Reload the stack before deleting. On windows, we can only delete the
-	   files after we closed them.
-	*/
+	/*
+	 * Reload the stack before deleting the compacted tables. We can only
+	 * delete the files after we closed them on Windows, so this needs to
+	 * happen first.
+	 */
 	err = reftable_stack_reload_maybe_reuse(st, first < last);
+	if (err < 0)
+		goto done;
 
-	listp = delete_on_success;
-	while (*listp) {
-		if (strcmp(*listp, new_table_path.buf)) {
-			unlink(*listp);
-		}
-		listp++;
+	/*
+	 * Delete the old tables. They may still be in use by concurrent
+	 * readers, so it is expected that unlinking tables may fail.
+	 */
+	for (i = first; i <= last; i++) {
+		struct lock_file *table_lock = &table_locks[i - first];
+		char *table_path = get_locked_file_path(table_lock);
+		unlink(table_path);
+		free(table_path);
 	}
 
 done:
-	free_names(delete_on_success);
+	rollback_lock_file(&tables_list_lock);
+	for (i = first; table_locks && i <= last; i++)
+		rollback_lock_file(&table_locks[i - first]);
+	reftable_free(table_locks);
 
-	if (subtable_locks) {
-		listp = subtable_locks;
-		while (*listp) {
-			unlink(*listp);
-			listp++;
-		}
-		free_names(subtable_locks);
-	}
-	if (lock_file_fd >= 0) {
-		close(lock_file_fd);
-		lock_file_fd = -1;
-	}
-	if (have_lock) {
-		unlink(lock_file_name.buf);
-	}
+	delete_tempfile(&new_table);
 	strbuf_release(&new_table_name);
 	strbuf_release(&new_table_path);
-	strbuf_release(&ref_list_contents);
-	strbuf_release(&temp_tab_file_name);
-	strbuf_release(&lock_file_name);
+
+	strbuf_release(&tables_list_buf);
+	strbuf_release(&table_name);
 	return err;
 }
 
@@ -1204,7 +1200,7 @@ static int stack_compact_range_stats(struct reftable_stack *st,
 				     struct reftable_log_expiry_config *config)
 {
 	int err = stack_compact_range(st, first, last, config);
-	if (err > 0)
+	if (err == REFTABLE_LOCK_ERROR)
 		st->stats.failures++;
 	return err;
 }
@@ -1214,75 +1210,76 @@ static int segment_size(struct segment *s)
 	return s->end - s->start;
 }
 
-int fastlog2(uint64_t sz)
-{
-	int l = 0;
-	if (sz == 0)
-		return 0;
-	for (; sz; sz /= 2) {
-		l++;
-	}
-	return l - 1;
-}
-
-struct segment *sizes_to_segments(size_t *seglen, uint64_t *sizes, size_t n)
-{
-	struct segment *segs = reftable_calloc(n, sizeof(*segs));
-	struct segment cur = { 0 };
-	size_t next = 0, i;
-
-	if (n == 0) {
-		*seglen = 0;
-		return segs;
-	}
-	for (i = 0; i < n; i++) {
-		int log = fastlog2(sizes[i]);
-		if (cur.log != log && cur.bytes > 0) {
-			struct segment fresh = {
-				.start = i,
-			};
-
-			segs[next++] = cur;
-			cur = fresh;
-		}
-
-		cur.log = log;
-		cur.end = i + 1;
-		cur.bytes += sizes[i];
-	}
-	segs[next++] = cur;
-	*seglen = next;
-	return segs;
-}
-
 struct segment suggest_compaction_segment(uint64_t *sizes, size_t n)
 {
-	struct segment min_seg = {
-		.log = 64,
-	};
-	struct segment *segs;
-	size_t seglen = 0, i;
-
-	segs = sizes_to_segments(&seglen, sizes, n);
-	for (i = 0; i < seglen; i++) {
-		if (segment_size(&segs[i]) == 1)
-			continue;
+	struct segment seg = { 0 };
+	uint64_t bytes;
+	size_t i;
 
-		if (segs[i].log < min_seg.log)
-			min_seg = segs[i];
-	}
+	/*
+	 * If there are no tables or only a single one then we don't have to
+	 * compact anything. The sequence is geometric by definition already.
+	 */
+	if (n <= 1)
+		return seg;
 
-	while (min_seg.start > 0) {
-		size_t prev = min_seg.start - 1;
-		if (fastlog2(min_seg.bytes) < fastlog2(sizes[prev]))
+	/*
+	 * Find the ending table of the compaction segment needed to restore the
+	 * geometric sequence. Note that the segment end is exclusive.
+	 *
+	 * To do so, we iterate backwards starting from the most recent table
+	 * until a valid segment end is found. If the preceding table is smaller
+	 * than the current table multiplied by the geometric factor (2), the
+	 * compaction segment end has been identified.
+	 *
+	 * Tables after the ending point are not added to the byte count because
+	 * they are already valid members of the geometric sequence. Due to the
+	 * properties of a geometric sequence, it is not possible for the sum of
+	 * these tables to exceed the value of the ending point table.
+	 *
+	 * Example table size sequence requiring no compaction:
+	 * 	64, 32, 16, 8, 4, 2, 1
+	 *
+	 * Example table size sequence where compaction segment end is set to
+	 * the last table. Since the segment end is exclusive, the last table is
+	 * excluded during subsequent compaction and the table with size 3 is
+	 * the final table included:
+	 * 	64, 32, 16, 8, 4, 3, 1
+	 */
+	for (i = n - 1; i > 0; i--) {
+		if (sizes[i - 1] < sizes[i] * 2) {
+			seg.end = i + 1;
+			bytes = sizes[i];
 			break;
+		}
+	}
 
-		min_seg.start = prev;
-		min_seg.bytes += sizes[prev];
+	/*
+	 * Find the starting table of the compaction segment by iterating
+	 * through the remaining tables and keeping track of the accumulated
+	 * size of all tables seen from the segment end table. The previous
+	 * table is compared to the accumulated size because the tables from the
+	 * segment end are merged backwards recursively.
+	 *
+	 * Note that we keep iterating even after we have found the first
+	 * starting point. This is because there may be tables in the stack
+	 * preceding that first starting point which violate the geometric
+	 * sequence.
+	 *
+	 * Example compaction segment start set to table with size 32:
+	 * 	128, 32, 16, 8, 4, 3, 1
+	 */
+	for (; i > 0; i--) {
+		uint64_t curr = bytes;
+		bytes += sizes[i - 1];
+
+		if (sizes[i - 1] < curr * 2) {
+			seg.start = i - 1;
+			seg.bytes = bytes;
+		}
 	}
 
-	reftable_free(segs);
-	return min_seg;
+	return seg;
 }
 
 static uint64_t *stack_table_sizes_for_compaction(struct reftable_stack *st)
@@ -1352,65 +1349,6 @@ done:
 	return err;
 }
 
-static int stack_check_addition(struct reftable_stack *st,
-				const char *new_tab_name)
-{
-	int err = 0;
-	struct reftable_block_source src = { NULL };
-	struct reftable_reader *rd = NULL;
-	struct reftable_table tab = { NULL };
-	struct reftable_ref_record *refs = NULL;
-	struct reftable_iterator it = { NULL };
-	int cap = 0;
-	int len = 0;
-	int i = 0;
-
-	if (st->config.skip_name_check)
-		return 0;
-
-	err = reftable_block_source_from_file(&src, new_tab_name);
-	if (err < 0)
-		goto done;
-
-	err = reftable_new_reader(&rd, &src, new_tab_name);
-	if (err < 0)
-		goto done;
-
-	err = reftable_reader_seek_ref(rd, &it, "");
-	if (err > 0) {
-		err = 0;
-		goto done;
-	}
-	if (err < 0)
-		goto done;
-
-	while (1) {
-		struct reftable_ref_record ref = { NULL };
-		err = reftable_iterator_next_ref(&it, &ref);
-		if (err > 0)
-			break;
-		if (err < 0)
-			goto done;
-
-		REFTABLE_ALLOC_GROW(refs, len + 1, cap);
-		refs[len++] = ref;
-	}
-
-	reftable_table_from_merged_table(&tab, reftable_stack_merged_table(st));
-
-	err = validate_ref_record_addition(tab, refs, len);
-
-done:
-	for (i = 0; i < len; i++) {
-		reftable_ref_record_release(&refs[i]);
-	}
-
-	free(refs);
-	reftable_iterator_destroy(&it);
-	reftable_reader_free(rd);
-	return err;
-}
-
 static int is_table_name(const char *s)
 {
 	const char *dot = strrchr(s, '.');
diff --git a/reftable/stack.h b/reftable/stack.h
index d919455669..d43efa4760 100644
--- a/reftable/stack.h
+++ b/reftable/stack.h
@@ -19,7 +19,6 @@ struct reftable_stack {
 	int list_fd;
 
 	char *reftable_dir;
-	int disable_auto_compact;
 
 	struct reftable_write_options config;
 
@@ -33,12 +32,9 @@ int read_lines(const char *filename, char ***lines);
 
 struct segment {
 	size_t start, end;
-	int log;
 	uint64_t bytes;
 };
 
-int fastlog2(uint64_t sz);
-struct segment *sizes_to_segments(size_t *seglen, uint64_t *sizes, size_t n);
 struct segment suggest_compaction_segment(uint64_t *sizes, size_t n);
 
 #endif
diff --git a/reftable/stack_test.c b/reftable/stack_test.c
index 509f486623..7889f818d1 100644
--- a/reftable/stack_test.c
+++ b/reftable/stack_test.c
@@ -38,7 +38,17 @@ static int count_dir_entries(const char *dirname)
 		return 0;
 
 	while ((d = readdir(dir))) {
-		if (!strcmp(d->d_name, "..") || !strcmp(d->d_name, "."))
+		/*
+		 * Besides skipping over "." and "..", we also need to
+		 * skip over other files that have a leading ".". This
+		 * is due to behaviour of NFS, which will rename files
+		 * to ".nfs*" to emulate delete-on-last-close.
+		 *
+		 * In any case this should be fine as the reftable
+		 * library will never write files with leading dots
+		 * anyway.
+		 */
+		if (starts_with(d->d_name, "."))
 			continue;
 		len++;
 	}
@@ -232,7 +242,7 @@ static void test_reftable_stack_uptodate(void)
 	EXPECT_ERR(err);
 
 	err = reftable_stack_add(st2, &write_test_ref, &ref2);
-	EXPECT(err == REFTABLE_LOCK_ERROR);
+	EXPECT(err == REFTABLE_OUTDATED_ERROR);
 
 	err = reftable_stack_reload(st2);
 	EXPECT_ERR(err);
@@ -315,7 +325,7 @@ static void test_reftable_stack_transaction_api_performs_auto_compaction(void)
 		 * we can ensure that we indeed honor this setting and have
 		 * better control over when exactly auto compaction runs.
 		 */
-		st->disable_auto_compact = i != n;
+		st->config.disable_auto_compact = i != n;
 
 		err = reftable_stack_new_addition(&add, st);
 		EXPECT_ERR(err);
@@ -343,41 +353,46 @@ static void test_reftable_stack_transaction_api_performs_auto_compaction(void)
 	clear_dir(dir);
 }
 
-static void test_reftable_stack_validate_refname(void)
+static void test_reftable_stack_auto_compaction_fails_gracefully(void)
 {
-	struct reftable_write_options cfg = { 0 };
-	struct reftable_stack *st = NULL;
-	int err;
-	char *dir = get_tmp_dir(__LINE__);
-
-	int i;
 	struct reftable_ref_record ref = {
-		.refname = "a/b",
+		.refname = "refs/heads/master",
 		.update_index = 1,
-		.value_type = REFTABLE_REF_SYMREF,
-		.value.symref = "master",
+		.value_type = REFTABLE_REF_VAL1,
+		.value.val1 = {0x01},
 	};
-	char *additions[] = { "a", "a/b/c" };
+	struct reftable_write_options cfg = {0};
+	struct reftable_stack *st;
+	struct strbuf table_path = STRBUF_INIT;
+	char *dir = get_tmp_dir(__LINE__);
+	int err;
 
 	err = reftable_new_stack(&st, dir, cfg);
 	EXPECT_ERR(err);
 
-	err = reftable_stack_add(st, &write_test_ref, &ref);
+	err = reftable_stack_add(st, write_test_ref, &ref);
 	EXPECT_ERR(err);
+	EXPECT(st->merged->stack_len == 1);
+	EXPECT(st->stats.attempts == 0);
+	EXPECT(st->stats.failures == 0);
+
+	/*
+	 * Lock the newly written table such that it cannot be compacted.
+	 * Adding a new table to the stack should not be impacted by this, even
+	 * though auto-compaction will now fail.
+	 */
+	strbuf_addf(&table_path, "%s/%s.lock", dir, st->readers[0]->name);
+	write_file_buf(table_path.buf, "", 0);
 
-	for (i = 0; i < ARRAY_SIZE(additions); i++) {
-		struct reftable_ref_record ref = {
-			.refname = additions[i],
-			.update_index = 1,
-			.value_type = REFTABLE_REF_SYMREF,
-			.value.symref = "master",
-		};
-
-		err = reftable_stack_add(st, &write_test_ref, &ref);
-		EXPECT(err == REFTABLE_NAME_CONFLICT);
-	}
+	ref.update_index = 2;
+	err = reftable_stack_add(st, write_test_ref, &ref);
+	EXPECT_ERR(err);
+	EXPECT(st->merged->stack_len == 2);
+	EXPECT(st->stats.attempts == 1);
+	EXPECT(st->stats.failures == 1);
 
 	reftable_stack_destroy(st);
+	strbuf_release(&table_path);
 	clear_dir(dir);
 }
 
@@ -444,6 +459,7 @@ static void test_reftable_stack_add(void)
 	struct reftable_write_options cfg = {
 		.exact_log_message = 1,
 		.default_permissions = 0660,
+		.disable_auto_compact = 1,
 	};
 	struct reftable_stack *st = NULL;
 	char *dir = get_tmp_dir(__LINE__);
@@ -455,7 +471,6 @@ static void test_reftable_stack_add(void)
 
 	err = reftable_new_stack(&st, dir, cfg);
 	EXPECT_ERR(err);
-	st->disable_auto_compact = 1;
 
 	for (i = 0; i < N; i++) {
 		char buf[256];
@@ -468,8 +483,6 @@ static void test_reftable_stack_add(void)
 		logs[i].refname = xstrdup(buf);
 		logs[i].update_index = N + i + 1;
 		logs[i].value_type = REFTABLE_LOG_UPDATE;
-
-		logs[i].value.update.new_hash = reftable_malloc(GIT_SHA1_RAWSZ);
 		logs[i].value.update.email = xstrdup("identity@invalid");
 		set_test_hash(logs[i].value.update.new_hash, i);
 	}
@@ -547,16 +560,17 @@ static void test_reftable_stack_log_normalize(void)
 	};
 	struct reftable_stack *st = NULL;
 	char *dir = get_tmp_dir(__LINE__);
-
-	uint8_t h1[GIT_SHA1_RAWSZ] = { 0x01 }, h2[GIT_SHA1_RAWSZ] = { 0x02 };
-
-	struct reftable_log_record input = { .refname = "branch",
-					     .update_index = 1,
-					     .value_type = REFTABLE_LOG_UPDATE,
-					     .value = { .update = {
-								.new_hash = h1,
-								.old_hash = h2,
-							} } };
+	struct reftable_log_record input = {
+		.refname = "branch",
+		.update_index = 1,
+		.value_type = REFTABLE_LOG_UPDATE,
+		.value = {
+			.update = {
+				.new_hash = { 1 },
+				.old_hash = { 2 },
+			},
+		},
+	};
 	struct reftable_log_record dest = {
 		.update_index = 0,
 	};
@@ -627,8 +641,6 @@ static void test_reftable_stack_tombstone(void)
 		logs[i].update_index = 42;
 		if (i % 2 == 0) {
 			logs[i].value_type = REFTABLE_LOG_UPDATE;
-			logs[i].value.update.new_hash =
-				reftable_malloc(GIT_SHA1_RAWSZ);
 			set_test_hash(logs[i].value.update.new_hash, i);
 			logs[i].value.update.email =
 				xstrdup("identity@invalid");
@@ -720,59 +732,13 @@ static void test_reftable_stack_hash_id(void)
 	clear_dir(dir);
 }
 
-static void test_log2(void)
-{
-	EXPECT(1 == fastlog2(3));
-	EXPECT(2 == fastlog2(4));
-	EXPECT(2 == fastlog2(5));
-}
-
-static void test_sizes_to_segments(void)
-{
-	uint64_t sizes[] = { 2, 3, 4, 5, 7, 9 };
-	/* .................0  1  2  3  4  5 */
-
-	size_t seglen = 0;
-	struct segment *segs =
-		sizes_to_segments(&seglen, sizes, ARRAY_SIZE(sizes));
-	EXPECT(segs[2].log == 3);
-	EXPECT(segs[2].start == 5);
-	EXPECT(segs[2].end == 6);
-
-	EXPECT(segs[1].log == 2);
-	EXPECT(segs[1].start == 2);
-	EXPECT(segs[1].end == 5);
-	reftable_free(segs);
-}
-
-static void test_sizes_to_segments_empty(void)
-{
-	size_t seglen = 0;
-	struct segment *segs = sizes_to_segments(&seglen, NULL, 0);
-	EXPECT(seglen == 0);
-	reftable_free(segs);
-}
-
-static void test_sizes_to_segments_all_equal(void)
-{
-	uint64_t sizes[] = { 5, 5 };
-	size_t seglen = 0;
-	struct segment *segs =
-		sizes_to_segments(&seglen, sizes, ARRAY_SIZE(sizes));
-	EXPECT(seglen == 1);
-	EXPECT(segs[0].start == 0);
-	EXPECT(segs[0].end == 2);
-	reftable_free(segs);
-}
-
 static void test_suggest_compaction_segment(void)
 {
-	uint64_t sizes[] = { 128, 64, 17, 16, 9, 9, 9, 16, 16 };
-	/* .................0    1    2  3   4  5  6 */
+	uint64_t sizes[] = { 512, 64, 17, 16, 9, 9, 9, 16, 2, 16 };
 	struct segment min =
 		suggest_compaction_segment(sizes, ARRAY_SIZE(sizes));
-	EXPECT(min.start == 2);
-	EXPECT(min.end == 7);
+	EXPECT(min.start == 1);
+	EXPECT(min.end == 10);
 }
 
 static void test_suggest_compaction_segment_nothing(void)
@@ -810,7 +776,6 @@ static void test_reflog_expire(void)
 		logs[i].update_index = i;
 		logs[i].value_type = REFTABLE_LOG_UPDATE;
 		logs[i].value.update.time = i;
-		logs[i].value.update.new_hash = reftable_malloc(GIT_SHA1_RAWSZ);
 		logs[i].value.update.email = xstrdup("identity@invalid");
 		set_test_hash(logs[i].value.update.new_hash, i);
 	}
@@ -884,9 +849,21 @@ static void test_empty_add(void)
 	reftable_stack_destroy(st2);
 }
 
+static int fastlog2(uint64_t sz)
+{
+	int l = 0;
+	if (sz == 0)
+		return 0;
+	for (; sz; sz /= 2)
+		l++;
+	return l - 1;
+}
+
 static void test_reftable_stack_auto_compaction(void)
 {
-	struct reftable_write_options cfg = { 0 };
+	struct reftable_write_options cfg = {
+		.disable_auto_compact = 1,
+	};
 	struct reftable_stack *st = NULL;
 	char *dir = get_tmp_dir(__LINE__);
 
@@ -896,7 +873,6 @@ static void test_reftable_stack_auto_compaction(void)
 	err = reftable_new_stack(&st, dir, cfg);
 	EXPECT_ERR(err);
 
-	st->disable_auto_compact = 1; /* call manually below for coverage. */
 	for (i = 0; i < N; i++) {
 		char name[100];
 		struct reftable_ref_record ref = {
@@ -945,7 +921,7 @@ static void test_reftable_stack_add_performs_auto_compaction(void)
 		 * we can ensure that we indeed honor this setting and have
 		 * better control over when exactly auto compaction runs.
 		 */
-		st->disable_auto_compact = i != n;
+		st->config.disable_auto_compact = i != n;
 
 		strbuf_reset(&refname);
 		strbuf_addf(&refname, "branch-%04d", i);
@@ -1072,7 +1048,6 @@ static void test_reftable_stack_compaction_concurrent_clean(void)
 int stack_test_main(int argc, const char *argv[])
 {
 	RUN_TEST(test_empty_add);
-	RUN_TEST(test_log2);
 	RUN_TEST(test_names_equal);
 	RUN_TEST(test_parse_names);
 	RUN_TEST(test_read_file);
@@ -1089,12 +1064,9 @@ int stack_test_main(int argc, const char *argv[])
 	RUN_TEST(test_reftable_stack_tombstone);
 	RUN_TEST(test_reftable_stack_transaction_api);
 	RUN_TEST(test_reftable_stack_transaction_api_performs_auto_compaction);
+	RUN_TEST(test_reftable_stack_auto_compaction_fails_gracefully);
 	RUN_TEST(test_reftable_stack_update_index_check);
 	RUN_TEST(test_reftable_stack_uptodate);
-	RUN_TEST(test_reftable_stack_validate_refname);
-	RUN_TEST(test_sizes_to_segments);
-	RUN_TEST(test_sizes_to_segments_all_equal);
-	RUN_TEST(test_sizes_to_segments_empty);
 	RUN_TEST(test_suggest_compaction_segment);
 	RUN_TEST(test_suggest_compaction_segment_nothing);
 	return 0;
diff --git a/reftable/system.h b/reftable/system.h
index 6b74a81514..5d8b6dede5 100644
--- a/reftable/system.h
+++ b/reftable/system.h
@@ -12,7 +12,9 @@ https://developers.google.com/open-source/licenses/bsd
 /* This header glues the reftable library to the rest of Git */
 
 #include "git-compat-util.h"
+#include "lockfile.h"
 #include "strbuf.h"
+#include "tempfile.h"
 #include "hash-ll.h" /* hash ID, sizes.*/
 #include "dir.h" /* remove_dir_recursively, for tests.*/
 
diff --git a/reftable/writer.c b/reftable/writer.c
index 1d9ff0fbfa..10eccaaa07 100644
--- a/reftable/writer.c
+++ b/reftable/writer.c
@@ -109,7 +109,7 @@ static void writer_reinit_block_writer(struct reftable_writer *w, uint8_t typ)
 		block_start = header_size(writer_version(w));
 	}
 
-	strbuf_release(&w->last_key);
+	strbuf_reset(&w->last_key);
 	block_writer_init(&w->block_writer_data, typ, w->block,
 			  w->opts.block_size, block_start,
 			  hash_size(w->opts.hash_id));
@@ -149,11 +149,21 @@ void reftable_writer_set_limits(struct reftable_writer *w, uint64_t min,
 	w->max_update_index = max;
 }
 
+static void writer_release(struct reftable_writer *w)
+{
+	if (w) {
+		reftable_free(w->block);
+		w->block = NULL;
+		block_writer_release(&w->block_writer_data);
+		w->block_writer = NULL;
+		writer_clear_index(w);
+		strbuf_release(&w->last_key);
+	}
+}
+
 void reftable_writer_free(struct reftable_writer *w)
 {
-	if (!w)
-		return;
-	reftable_free(w->block);
+	writer_release(w);
 	reftable_free(w);
 }
 
@@ -209,7 +219,8 @@ static int writer_add_record(struct reftable_writer *w,
 			     struct reftable_record *rec)
 {
 	struct strbuf key = STRBUF_INIT;
-	int err = -1;
+	int err;
+
 	reftable_record_key(rec, &key);
 	if (strbuf_cmp(&w->last_key, &key) >= 0) {
 		err = REFTABLE_API_ERROR;
@@ -218,27 +229,42 @@ static int writer_add_record(struct reftable_writer *w,
 
 	strbuf_reset(&w->last_key);
 	strbuf_addbuf(&w->last_key, &key);
-	if (!w->block_writer) {
+	if (!w->block_writer)
 		writer_reinit_block_writer(w, reftable_record_type(rec));
-	}
 
-	assert(block_writer_type(w->block_writer) == reftable_record_type(rec));
+	if (block_writer_type(w->block_writer) != reftable_record_type(rec))
+		BUG("record of type %d added to writer of type %d",
+		    reftable_record_type(rec), block_writer_type(w->block_writer));
 
-	if (block_writer_add(w->block_writer, rec) == 0) {
+	/*
+	 * Try to add the record to the writer. If this succeeds then we're
+	 * done. Otherwise the block writer may have hit the block size limit
+	 * and needs to be flushed.
+	 */
+	if (!block_writer_add(w->block_writer, rec)) {
 		err = 0;
 		goto done;
 	}
 
+	/*
+	 * The current block is full, so we need to flush and reinitialize the
+	 * writer to start writing the next block.
+	 */
 	err = writer_flush_block(w);
-	if (err < 0) {
+	if (err < 0)
 		goto done;
-	}
-
 	writer_reinit_block_writer(w, reftable_record_type(rec));
+
+	/*
+	 * Try to add the record to the writer again. If this still fails then
+	 * the record does not fit into the block size.
+	 *
+	 * TODO: it would be great to have `block_writer_add()` return proper
+	 *       error codes so that we don't have to second-guess the failure
+	 *       mode here.
+	 */
 	err = block_writer_add(w->block_writer, rec);
-	if (err == -1) {
-		/* we are writing into memory, so an error can only mean it
-		 * doesn't fit. */
+	if (err) {
 		err = REFTABLE_ENTRY_TOO_BIG_ERROR;
 		goto done;
 	}
@@ -452,7 +478,7 @@ static int writer_finish_section(struct reftable_writer *w)
 	bstats->max_index_level = max_level;
 
 	/* Reinit lastKey, as the next section can start with any key. */
-	w->last_key.len = 0;
+	strbuf_reset(&w->last_key);
 
 	return 0;
 }
@@ -627,74 +653,87 @@ int reftable_writer_close(struct reftable_writer *w)
 	}
 
 done:
-	/* free up memory. */
-	block_writer_release(&w->block_writer_data);
-	writer_clear_index(w);
-	strbuf_release(&w->last_key);
+	writer_release(w);
 	return err;
 }
 
 static void writer_clear_index(struct reftable_writer *w)
 {
-	for (size_t i = 0; i < w->index_len; i++)
+	for (size_t i = 0; w->index && i < w->index_len; i++)
 		strbuf_release(&w->index[i].last_key);
 	FREE_AND_NULL(w->index);
 	w->index_len = 0;
 	w->index_cap = 0;
 }
 
-static const int debug = 0;
-
 static int writer_flush_nonempty_block(struct reftable_writer *w)
 {
+	struct reftable_index_record index_record = {
+		.last_key = STRBUF_INIT,
+	};
 	uint8_t typ = block_writer_type(w->block_writer);
-	struct reftable_block_stats *bstats =
-		writer_reftable_block_stats(w, typ);
-	uint64_t block_typ_off = (bstats->blocks == 0) ? w->next : 0;
-	int raw_bytes = block_writer_finish(w->block_writer);
-	int padding = 0;
-	int err = 0;
-	struct reftable_index_record ir = { .last_key = STRBUF_INIT };
+	struct reftable_block_stats *bstats;
+	int raw_bytes, padding = 0, err;
+	uint64_t block_typ_off;
+
+	/*
+	 * Finish the current block. This will cause the block writer to emit
+	 * restart points and potentially compress records in case we are
+	 * writing a log block.
+	 *
+	 * Note that this is still happening in memory.
+	 */
+	raw_bytes = block_writer_finish(w->block_writer);
 	if (raw_bytes < 0)
 		return raw_bytes;
 
-	if (!w->opts.unpadded && typ != BLOCK_TYPE_LOG) {
+	/*
+	 * By default, all records except for log records are padded to the
+	 * block size.
+	 */
+	if (!w->opts.unpadded && typ != BLOCK_TYPE_LOG)
 		padding = w->opts.block_size - raw_bytes;
-	}
 
-	if (block_typ_off > 0) {
+	bstats = writer_reftable_block_stats(w, typ);
+	block_typ_off = (bstats->blocks == 0) ? w->next : 0;
+	if (block_typ_off > 0)
 		bstats->offset = block_typ_off;
-	}
-
 	bstats->entries += w->block_writer->entries;
 	bstats->restarts += w->block_writer->restart_len;
 	bstats->blocks++;
 	w->stats.blocks++;
 
-	if (debug) {
-		fprintf(stderr, "block %c off %" PRIu64 " sz %d (%d)\n", typ,
-			w->next, raw_bytes,
-			get_be24(w->block + w->block_writer->header_off + 1));
-	}
-
-	if (w->next == 0) {
+	/*
+	 * If this is the first block we're writing to the table then we need
+	 * to also write the reftable header.
+	 */
+	if (!w->next)
 		writer_write_header(w, w->block);
-	}
 
 	err = padded_write(w, w->block, raw_bytes, padding);
 	if (err < 0)
 		return err;
 
+	/*
+	 * Add an index record for every block that we're writing. If we end up
+	 * having more than a threshold of index records we will end up writing
+	 * an index section in `writer_finish_section()`. Each index record
+	 * contains the last record key of the block it is indexing as well as
+	 * the offset of that block.
+	 *
+	 * Note that this also applies when flushing index blocks, in which
+	 * case we will end up with a multi-level index.
+	 */
 	REFTABLE_ALLOC_GROW(w->index, w->index_len + 1, w->index_cap);
-
-	ir.offset = w->next;
-	strbuf_reset(&ir.last_key);
-	strbuf_addbuf(&ir.last_key, &w->block_writer->last_key);
-	w->index[w->index_len] = ir;
-
+	index_record.offset = w->next;
+	strbuf_reset(&index_record.last_key);
+	strbuf_addbuf(&index_record.last_key, &w->block_writer->last_key);
+	w->index[w->index_len] = index_record;
 	w->index_len++;
+
 	w->next += padding + raw_bytes;
 	w->block_writer = NULL;
+
 	return 0;
 }