aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorMathieu Bordere <mathieu@letmetweakit.com>2025-09-24 12:41:06 +0200
committerPádraig Brady <P@draigBrady.com>2025-09-30 14:09:37 +0100
commit67e9068c5f5fdae5666279717a4c19bdfe5c21de (patch)
tree3433b931659ca653d12b7d137e9f76cfdb454da2 /src
parentmaint: update valgrind instructions (diff)
downloadcoreutils-67e9068c5f5fdae5666279717a4c19bdfe5c21de.tar.gz
coreutils-67e9068c5f5fdae5666279717a4c19bdfe5c21de.zip
wc: add AVX512 function for line counting
* configure.ac: Add detection of AVX512 intrinsics for wc. * src/local.mk: Build AVX512 wc libraries. * src/wc.c: Add runtime detection of AVX512 intrinsics and call appropriate function when detected. * src/wc.h (wc_lines_avx512): Declare function. * tests/wc/wc-cpu.sh: Add a test that disables AVX512 intrinsics. * src/wc_avx512.c: New file containing the wc -l implementation using AVX512. The logic and code is reused from the AVX2 implementation with slight adaptations. Replaced __builtin_popcount by __builtin_popcountll and the combination of _mm256_cmpeq_epi8 and _mm256_movemask_epi8 by a single call to _mm512_cmpeq_epi8_mask. * NEWS: Mention the improvement.
Diffstat (limited to 'src')
-rw-r--r--src/local.mk7
-rw-r--r--src/wc.c30
-rw-r--r--src/wc.h1
-rw-r--r--src/wc_avx512.c58
4 files changed, 92 insertions, 4 deletions
diff --git a/src/local.mk b/src/local.mk
index f8a4bcffb..a55c9f990 100644
--- a/src/local.mk
+++ b/src/local.mk
@@ -484,6 +484,13 @@ src_expand_SOURCES = src/expand.c src/expand-common.c
src_unexpand_SOURCES = src/unexpand.c src/expand-common.c
src_wc_SOURCES = src/wc.c
+if USE_AVX512_WC_LINECOUNT
+noinst_LIBRARIES += src/libwc_avx512.a
+src_libwc_avx512_a_SOURCES = src/wc_avx512.c
+wc_avx512_ldadd = src/libwc_avx512.a
+src_wc_LDADD += $(wc_avx512_ldadd)
+src_libwc_avx512_a_CFLAGS = -mavx512bw -mavx512f $(AM_CFLAGS)
+endif
if USE_AVX2_WC_LINECOUNT
noinst_LIBRARIES += src/libwc_avx2.a
src_libwc_avx2_a_SOURCES = src/wc_avx2.c
diff --git a/src/wc.c b/src/wc.c
index 777277f23..243399393 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -134,14 +134,29 @@ static enum total_type total_mode = total_auto;
static bool
avx2_supported (void)
{
- bool avx_enabled = cpu_supports ("avx2");
-
+ bool avx2_enabled = cpu_supports ("avx2");
if (debug)
- error (0, 0, (avx_enabled
+ error (0, 0, (avx2_enabled
? _("using avx2 hardware support")
: _("avx2 support not detected")));
- return avx_enabled;
+ return avx2_enabled;
+}
+#endif
+
+#ifdef USE_AVX512_WC_LINECOUNT
+static bool
+avx512_supported (void)
+{
+ bool avx512_enabled = (cpu_supports ("avx512f")
+ && cpu_supports ("avx512bw"));
+
+ if (debug)
+ error (0, 0, (avx512_enabled
+ ? _("using avx512 hardware support")
+ : _("avx512 support not detected")));
+
+ return avx512_enabled;
}
#endif
@@ -246,6 +261,13 @@ write_counts (uintmax_t lines,
static struct wc_lines
wc_lines (int fd)
{
+#ifdef USE_AVX512_WC_LINECOUNT
+ static signed char use_avx512;
+ if (!use_avx512)
+ use_avx512 = avx512_supported () ? 1 : -1;
+ if (0 < use_avx512)
+ return wc_lines_avx512 (fd);
+#endif
#ifdef USE_AVX2_WC_LINECOUNT
static signed char use_avx2;
if (!use_avx2)
diff --git a/src/wc.h b/src/wc.h
index a6b4c9e84..f151e92f2 100644
--- a/src/wc.h
+++ b/src/wc.h
@@ -1,3 +1,4 @@
#include <stdint.h>
struct wc_lines { int err; intmax_t lines; intmax_t bytes; };
struct wc_lines wc_lines_avx2 (int);
+struct wc_lines wc_lines_avx512 (int);
diff --git a/src/wc_avx512.c b/src/wc_avx512.c
new file mode 100644
index 000000000..41faea646
--- /dev/null
+++ b/src/wc_avx512.c
@@ -0,0 +1,58 @@
+/* wc_avx512 - Count the number of newlines with avx512 instructions.
+ Copyright (C) 2021-2025 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "wc.h"
+#include "system.h"
+#include "ioblksize.h"
+
+#include <x86intrin.h>
+
+/* Read FD and return a summary. */
+extern struct wc_lines
+wc_lines_avx512 (int fd)
+{
+ intmax_t lines = 0;
+ intmax_t bytes = 0;
+
+ __m512i endlines = _mm512_set1_epi8 ('\n');
+
+ while (true)
+ {
+ __m512i avx_buf[IO_BUFSIZE / sizeof (__m512i)];
+ ssize_t bytes_read = read (fd, avx_buf, sizeof avx_buf);
+ if (bytes_read <= 0)
+ return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes };
+
+ bytes += bytes_read;
+ __m512i *datap = avx_buf;
+
+ while (bytes_read >= 64)
+ {
+ __m512i to_match = _mm512_load_si512 (datap);
+ long long matches = _mm512_cmpeq_epi8_mask (to_match, endlines);
+ lines += __builtin_popcountll (matches);
+ datap += 1;
+ bytes_read -= 64;
+ }
+
+ /* Finish up any left over bytes */
+ char *end = (char *) datap + bytes_read;
+ for (char *p = (char *) datap; p < end; p++)
+ lines += *p == '\n';
+ }
+}