aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Pádraig Brady <P@draigBrady.com> 2025-10-14 16:17:56 +0100
committer: Pádraig Brady <P@draigBrady.com> 2025-10-17 17:26:25 +0100
commit: 8bc11f80a3eff1afec437383a63953e21a2063cd (patch)
tree: bacaa365f8152e3ec8a3cd283ee0d61f2c5e7ade
parent: tests: du/bigtime: try harder to find a suitable filesystem (diff)
download: coreutils-8bc11f80a3eff1afec437383a63953e21a2063cd.tar.gz
          coreutils-8bc11f80a3eff1afec437383a63953e21a2063cd.zip
numfmt: support reading numbers with NBSP before unit
* src/numfmt.c (simple_strtod_human): Accept a (multi-byte) non-breaking space character between number and unit. Note we restrict this to a single character between number and unit, to allow less ambiguous parsing if multiple blanks are used to delimit fields.
* tests/misc/numfmt.pl: Add test cases.
* doc/coreutils.texi (numfmt invocation): Fix stale description of --delimiter skipping whitespace.
* NEWS: Mention the improvement.
-rw-r--r-- NEWS 2
-rw-r--r-- doc/coreutils.texi 2
-rw-r--r-- src/numfmt.c 31
-rwxr-xr-x tests/misc/numfmt.pl 23
4 files changed, 47 insertions, 11 deletions
diff --git a/NEWS b/NEWS
index e6053a04b..40d443942 100644
--- a/NEWS
+++ b/NEWS
@@ -35,6 +35,8 @@ GNU coreutils NEWS -*- outline -*-
** Improvements
+ numfmt now parses numbers with a non-breaking space character before a unit.
+
wc -l now operates 10% faster on hosts that support AVX512 instructions.
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 26c9209a3..b50e5f724 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -19447,7 +19447,7 @@ Print (to standard error) warning messages about possible erroneous usage.
@itemx --delimiter=@var{d}
@opindex -d
@opindex --delimiter
-Use the character @var{d} as input field separator (default: whitespace).
+Use the character @var{d} as input field separator (default: newline or blank).
Using non-default delimiter turns off automatic padding.
@item --field=@var{fields}
diff --git a/src/numfmt.c b/src/numfmt.c
index 0cc12689e..fbf104b51 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -25,6 +25,7 @@
#include "argmatch.h"
#include "c-ctype.h"
#include "mbswidth.h"
+#include "mcel.h"
#include "quote.h"
#include "skipchars.h"
#include "system.h"
@@ -210,6 +211,11 @@ static int decimal_point_length;
/* debugging for developers. Enables devmsg(). */
static bool dev_debug = false;
+static bool
+newline_or_blank (mcel_t g)
+{
+ return g.ch == '\n' || c32isblank (g.ch);
+}
static inline int
default_scale_base (enum scale_type scale)
@@ -645,15 +651,23 @@ simple_strtod_human (char const *input_str,
{
/* process suffix. */
- /* Skip any blanks between the number and suffix. */
- while (isblank (to_uchar (**endptr)))
- (*endptr)++;
+ /* Skip a single blank or NBSP between the number and suffix. */
+ mcel_t g = mcel_scanz (*endptr);
+ if (c32isblank (g.ch) || c32isnbspace (g.ch))
+ (*endptr) += g.len;
if (**endptr == '\0')
break; /* Treat as no suffix. */
if (!valid_suffix (**endptr))
- return SSE_INVALID_SUFFIX;
+ {
+ /* Trailing blanks are allowed. */
+ *endptr = skip_str_matching (*endptr, newline_or_blank, true);
+ if (**endptr == '\0')
+ break;
+
+ return SSE_INVALID_SUFFIX;
+ }
if (allowed_scaling == scale_none)
return SSE_VALID_BUT_FORBIDDEN_SUFFIX;
@@ -680,6 +694,9 @@ simple_strtod_human (char const *input_str,
*precision = 0; /* Reset, to select precision based on scale. */
+ /* Trailing blanks are allowed. */
+ *endptr = skip_str_matching (*endptr, newline_or_blank, true);
+
break;
}
@@ -1320,12 +1337,6 @@ process_suffixed_number (char *text, long double *result,
return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS);
}
-static bool
-newline_or_blank (mcel_t g)
-{
- return g.ch == '\n' || c32isblank (g.ch);
-}
-
/* Return a pointer to the beginning of the next field in line.
The line pointer is moved to the end of the next field. */
static char*
diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl
index 4dd9718c9..85c888cd8 100755
--- a/tests/misc/numfmt.pl
+++ b/tests/misc/numfmt.pl
@@ -164,6 +164,14 @@ my @Tests =
'--suffix=Foo' . 'x' x 122 . 'y 0',
{OUT => '0Foo' . 'x' x 122 . 'y'}],
['suf-21', "-d '' --from=si '4 '", {OUT => "4"}],
+ # Multiple spaces between number and suffix should be rejected
+ ['suf-22', "-d '' --from=auto '2 K'",
+ {ERR => "$prog: invalid suffix in input: '2 K'\n"},
+ {EXIT => 2}],
+ # Trailing spaces should be accepted
+ ['suf-23', "-d '' --from=auto '2 '", {OUT=>'2'}],
+ ['suf-24', "-d '' --from=auto '2 '", {OUT=>'2'}],
+ ['suf-25', "-d '' --from=auto '2K '", {OUT=>'2000'}],
## GROUPING
@@ -1067,6 +1075,21 @@ my @Locale_Tests =
['lcl-fmt-7', '--format="%0\'\'6f" 1234',{OUT=>"01${lg}234"},
{ENV=>"LC_ALL=$locale"}],
+ # Single blank/NBSP acceptance between number and suffix
+ ['lcl-suf-1', "-d '' --from=auto '2 K'", {OUT => "2000"},
+ {ENV=>"LC_ALL=$locale"}],
+ ['lcl-suf-2', "-d '' --from=auto '2\tK'", {OUT => "2000"},
+ {ENV=>"LC_ALL=$locale"}],
+ # NBSP characters: U+00A0, U+2007, U+202F, U+2060
+ ['lcl-suf-3', "--from=auto '2\xc2\xa0K'", {OUT => "2000"},
+ {ENV=>"LC_ALL=$locale"}],
+ ['lcl-suf-4', "--from=auto '2\xe2\x80\x87Ki'", {OUT => "2048"},
+ {ENV=>"LC_ALL=$locale"}],
+ ['lcl-suf-5', "--from=auto '2\xe2\x80\xafK'", {OUT => "2000"},
+ {ENV=>"LC_ALL=$locale"}],
+ ['lcl-suf-6', "--from=auto '2\xe2\x81\xa0Ki'", {OUT => "2048"},
+ {ENV=>"LC_ALL=$locale"}],
+
);
if ($locale ne 'C')
{