diff options
| author | Mathieu Bordere <mathieu@letmetweakit.com> | 2025-09-24 12:41:06 +0200 |
|---|---|---|
| committer | Pádraig Brady <P@draigBrady.com> | 2025-09-30 14:09:37 +0100 |
| commit | 67e9068c5f5fdae5666279717a4c19bdfe5c21de (patch) | |
| tree | 3433b931659ca653d12b7d137e9f76cfdb454da2 /src | |
| parent | maint: update valgrind instructions (diff) | |
| download | coreutils-67e9068c5f5fdae5666279717a4c19bdfe5c21de.tar.gz coreutils-67e9068c5f5fdae5666279717a4c19bdfe5c21de.zip | |
wc: add AVX512 function for line counting
* configure.ac: Add detection of AVX512 intrinsics for wc.
* src/local.mk: Build AVX512 wc libraries.
* src/wc.c: Add runtime detection of AVX512 intrinsics and call
appropriate function when detected.
* src/wc.h (wc_lines_avx512): Declare function.
* tests/wc/wc-cpu.sh: Add a test that disables AVX512 intrinsics.
* src/wc_avx512.c: New file containing the wc -l implementation using
AVX512. The logic and code is reused from the AVX2 implementation with
slight adaptations. Replaced __builtin_popcount by __builtin_popcountll
and the combination of _mm256_cmpeq_epi8 and _mm256_movemask_epi8 by a
single call to _mm512_cmpeq_epi8_mask.
* NEWS: Mention the improvement.
Diffstat (limited to 'src')
| -rw-r--r-- | src/local.mk | 7 | ||||
| -rw-r--r-- | src/wc.c | 30 | ||||
| -rw-r--r-- | src/wc.h | 1 | ||||
| -rw-r--r-- | src/wc_avx512.c | 58 |
4 files changed, 92 insertions, 4 deletions
diff --git a/src/local.mk b/src/local.mk index f8a4bcffb..a55c9f990 100644 --- a/src/local.mk +++ b/src/local.mk @@ -484,6 +484,13 @@ src_expand_SOURCES = src/expand.c src/expand-common.c src_unexpand_SOURCES = src/unexpand.c src/expand-common.c src_wc_SOURCES = src/wc.c +if USE_AVX512_WC_LINECOUNT +noinst_LIBRARIES += src/libwc_avx512.a +src_libwc_avx512_a_SOURCES = src/wc_avx512.c +wc_avx512_ldadd = src/libwc_avx512.a +src_wc_LDADD += $(wc_avx512_ldadd) +src_libwc_avx512_a_CFLAGS = -mavx512bw -mavx512f $(AM_CFLAGS) +endif if USE_AVX2_WC_LINECOUNT noinst_LIBRARIES += src/libwc_avx2.a src_libwc_avx2_a_SOURCES = src/wc_avx2.c @@ -134,14 +134,29 @@ static enum total_type total_mode = total_auto; static bool avx2_supported (void) { - bool avx_enabled = cpu_supports ("avx2"); - + bool avx2_enabled = cpu_supports ("avx2"); if (debug) - error (0, 0, (avx_enabled + error (0, 0, (avx2_enabled ? _("using avx2 hardware support") : _("avx2 support not detected"))); - return avx_enabled; + return avx2_enabled; +} +#endif + +#ifdef USE_AVX512_WC_LINECOUNT +static bool +avx512_supported (void) +{ + bool avx512_enabled = (cpu_supports ("avx512f") + && cpu_supports ("avx512bw")); + + if (debug) + error (0, 0, (avx512_enabled + ? _("using avx512 hardware support") + : _("avx512 support not detected"))); + + return avx512_enabled; } #endif @@ -246,6 +261,13 @@ write_counts (uintmax_t lines, static struct wc_lines wc_lines (int fd) { +#ifdef USE_AVX512_WC_LINECOUNT + static signed char use_avx512; + if (!use_avx512) + use_avx512 = avx512_supported () ? 1 : -1; + if (0 < use_avx512) + return wc_lines_avx512 (fd); +#endif #ifdef USE_AVX2_WC_LINECOUNT static signed char use_avx2; if (!use_avx2) @@ -1,3 +1,4 @@ #include <stdint.h> struct wc_lines { int err; intmax_t lines; intmax_t bytes; }; struct wc_lines wc_lines_avx2 (int); +struct wc_lines wc_lines_avx512 (int); diff --git a/src/wc_avx512.c b/src/wc_avx512.c new file mode 100644 index 000000000..41faea646 --- /dev/null +++ b/src/wc_avx512.c @@ -0,0 +1,58 @@ +/* wc_avx512 - Count the number of newlines with avx512 instructions. + Copyright (C) 2021-2025 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +#include <config.h> + +#include "wc.h" +#include "system.h" +#include "ioblksize.h" + +#include <x86intrin.h> + +/* Read FD and return a summary. */ +extern struct wc_lines +wc_lines_avx512 (int fd) +{ + intmax_t lines = 0; + intmax_t bytes = 0; + + __m512i endlines = _mm512_set1_epi8 ('\n'); + + while (true) + { + __m512i avx_buf[IO_BUFSIZE / sizeof (__m512i)]; + ssize_t bytes_read = read (fd, avx_buf, sizeof avx_buf); + if (bytes_read <= 0) + return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes }; + + bytes += bytes_read; + __m512i *datap = avx_buf; + + while (bytes_read >= 64) + { + __m512i to_match = _mm512_load_si512 (datap); + long long matches = _mm512_cmpeq_epi8_mask (to_match, endlines); + lines += __builtin_popcountll (matches); + datap += 1; + bytes_read -= 64; + } + + /* Finish up any left over bytes */ + char *end = (char *) datap + bytes_read; + for (char *p = (char *) datap; p < end; p++) + lines += *p == '\n'; + } +} |
