mirror of
https://github.com/PCRE2Project/pcre2.git
synced 2025-10-17 23:57:23 +08:00
pcre2grep: add $& as an alias for $0 (#519)
Perl does not use $0 anymore to refer to the text of the matched subject and `pcre2_substitute()` was recently updated to also provide that value using the variable Perl prefers: `$&`. In a similar context, either as part of the formatted output from a match or during the processing of a callback, teach pcre2grep to also populate $&. While at it, update the ChangeLog with recent changes.
This commit is contained in:

committed by
GitHub

parent
223941425f
commit
0d087cce82
2
NEWS
2
NEWS
@@ -52,7 +52,7 @@ a list). Those that are not bugfixes or code tidies are:
|
||||
matches the "fullwidth" versions of hex digits. PCRE2_EXTRA_ASCII_DIGIT can
|
||||
be used to keep it ASCII only.
|
||||
|
||||
* Make PCRE2_UCP the default in UTF mode in pcre2grep and add -no_ucp,
|
||||
* Make PCRE2_UCP the default in UTF mode in pcre2grep and add --no-ucp,
|
||||
--case-restrict and --posix-digit.
|
||||
|
||||
* Add --group-separator and --no-group-separator to pcre2grep.
|
||||
|
@@ -637,6 +637,8 @@ echo "RC=$?" >>testtrygrep
|
||||
echo "---------------------------- Test 120 ------------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -HO '$0:$2$1$3' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -HO '$&:$2$1$3' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -m 1 -O '$0:$a$b$e$f$r$t$v' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -HO '${X}' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep 2>&1
|
||||
|
@@ -4082,7 +4082,7 @@ processing a substitution such as:
|
||||
</PRE>
|
||||
</P>
|
||||
<P>
|
||||
The default case transformations applied by PCRE2 are reasonably complete, and,
|
||||
The default case transformations applied by PCRE2 are reasonably complete, and,
|
||||
in UTF or UCP mode, perform the basic locale-invariant case transformations as
|
||||
specified by Unicode. This is suitable for the internal (invisible)
|
||||
case-equivalence procedures used during pattern matching, but an application
|
||||
|
@@ -724,9 +724,9 @@ text.
|
||||
<br>
|
||||
<br>
|
||||
$<digits> or ${<digits>} is replaced by the captured substring of the given
|
||||
decimal number; zero substitutes the whole match. If the number is greater than
|
||||
the number of capturing substrings, or if the capture is unset, the replacement
|
||||
is empty.
|
||||
decimal number; $& (or the legacy $0) substitutes the whole match. If the
|
||||
number is greater than the number of capturing substrings, or if the capture
|
||||
is unset, the replacement is empty.
|
||||
<br>
|
||||
<br>
|
||||
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
|
||||
@@ -1025,9 +1025,9 @@ available, provided that callouts were not completely disabled when
|
||||
zero-terminated string, which means it should not contain any internal binary
|
||||
zeros. It is written to the output, having first been passed through the same
|
||||
escape processing as text from the <b>--output</b> (<b>-O</b>) option (see
|
||||
above). However, $0 cannot be used to insert a matched substring because the
|
||||
match is still in progress. Instead, the single character '0' is inserted. Any
|
||||
syntax errors in the string (for example, a dollar not followed by another
|
||||
above). However, $0 or $& cannot be used to insert a matched substring because
|
||||
the match is still in progress. Instead, the single character '0' is inserted.
|
||||
Any syntax errors in the string (for example, a dollar not followed by another
|
||||
character) cause the callout to be ignored. No terminator is added to the
|
||||
output string, so if you want a newline, you must include it explicitly using
|
||||
the escape $n. For example:
|
||||
@@ -1057,9 +1057,9 @@ arguments:
|
||||
</pre>
|
||||
Any substring (including the executable name) may contain escape sequences
|
||||
started by a dollar character. These are the same as for the <b>--output</b>
|
||||
(<b>-O</b>) option documented above, except that $0 cannot insert the matched
|
||||
string because the match is still in progress. Instead, the character '0'
|
||||
is inserted. If you need a literal dollar or pipe character in any
|
||||
(<b>-O</b>) option documented above, except that $0 or $& cannot insert the
|
||||
matched string because the match is still in progress. Instead, the character
|
||||
'0' is inserted. If you need a literal dollar or pipe character in any
|
||||
substring, use $$ or $| respectively. Here is an example:
|
||||
<pre>
|
||||
echo -e "abcde\n12345" | pcre2grep \
|
||||
|
@@ -188,8 +188,8 @@ REVISION
|
||||
|
||||
PCRE2 10.38 27 August 2021 PCRE2(3)
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
PCRE2API(3) Library Functions Manual PCRE2API(3)
|
||||
|
||||
|
||||
@@ -4209,8 +4209,8 @@ REVISION
|
||||
|
||||
PCRE2 10.45 04 October 2024 PCRE2API(3)
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
PCRE2BUILD(3) Library Functions Manual PCRE2BUILD(3)
|
||||
|
||||
|
||||
@@ -4835,8 +4835,8 @@ REVISION
|
||||
|
||||
PCRE2 10.44 15 April 2024 PCRE2BUILD(3)
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
PCRE2CALLOUT(3) Library Functions Manual PCRE2CALLOUT(3)
|
||||
|
||||
|
||||
@@ -5268,8 +5268,8 @@ REVISION
|
||||
|
||||
PCRE2 10.43 19 January 2024 PCRE2CALLOUT(3)
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
PCRE2COMPAT(3) Library Functions Manual PCRE2COMPAT(3)
|
||||
|
||||
|
||||
@@ -5522,8 +5522,8 @@ REVISION
|
||||
|
||||
PCRE2 10.45 01 September 2024 PCRE2COMPAT(3)
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
PCRE2JIT(3) Library Functions Manual PCRE2JIT(3)
|
||||
|
||||
|
||||
@@ -5977,8 +5977,8 @@ REVISION
|
||||
|
||||
PCRE2 10.45 23 July 2024 PCRE2JIT(3)
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
PCRE2LIMITS(3) Library Functions Manual PCRE2LIMITS(3)
|
||||
|
||||
|
||||
@@ -6060,8 +6060,8 @@ REVISION
|
||||
|
||||
PCRE2 10.43 1 August 2023 PCRE2LIMITS(3)
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
PCRE2MATCHING(3) Library Functions Manual PCRE2MATCHING(3)
|
||||
|
||||
|
||||
@@ -6293,8 +6293,8 @@ REVISION
|
||||
|
||||
PCRE2 10.45 30 August 2024 PCRE2MATCHING(3)
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
PCRE2PARTIAL(3) Library Functions Manual PCRE2PARTIAL(3)
|
||||
|
||||
|
||||
@@ -6676,8 +6676,8 @@ REVISION
|
||||
|
||||
PCRE2 10.34 04 September 2019 PCRE2PARTIAL(3)
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
PCRE2PATTERN(3) Library Functions Manual PCRE2PATTERN(3)
|
||||
|
||||
|
||||
@@ -10368,8 +10368,8 @@ REVISION
|
||||
|
||||
PCRE2 10.45 21 September 2024 PCRE2PATTERN(3)
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
PCRE2PERFORM(3) Library Functions Manual PCRE2PERFORM(3)
|
||||
|
||||
|
||||
@@ -10623,8 +10623,8 @@ REVISION
|
||||
|
||||
PCRE2 10.41 27 July 2022 PCRE2PERFORM(3)
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
PCRE2POSIX(3) Library Functions Manual PCRE2POSIX(3)
|
||||
|
||||
|
||||
@@ -10981,8 +10981,8 @@ REVISION
|
||||
|
||||
PCRE2 10.43 19 January 2024 PCRE2POSIX(3)
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
PCRE2SAMPLE(3) Library Functions Manual PCRE2SAMPLE(3)
|
||||
|
||||
|
||||
@@ -11265,8 +11265,8 @@ REVISION
|
||||
|
||||
PCRE2 10.32 27 June 2018 PCRE2SERIALIZE(3)
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
PCRE2SYNTAX(3) Library Functions Manual PCRE2SYNTAX(3)
|
||||
|
||||
|
||||
@@ -11923,8 +11923,8 @@ REVISION
|
||||
|
||||
PCRE2 10.45 24 September 2024 PCRE2SYNTAX(3)
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
PCRE2UNICODE(3) Library Functions Manual PCRE2UNICODE(3)
|
||||
|
||||
|
||||
@@ -12393,5 +12393,5 @@ REVISION
|
||||
|
||||
PCRE2 10.45 22 July 2024 PCRE2UNICODE(3)
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
|
@@ -4064,7 +4064,7 @@ processing a substitution such as:
|
||||
.sp
|
||||
pcre2_substitute(..., "\e\eU$1", ...)
|
||||
.P
|
||||
The default case transformations applied by PCRE2 are reasonably complete, and,
|
||||
The default case transformations applied by PCRE2 are reasonably complete, and,
|
||||
in UTF or UCP mode, perform the basic locale-invariant case transformations as
|
||||
specified by Unicode. This is suitable for the internal (invisible)
|
||||
case-equivalence procedures used during pattern matching, but an application
|
||||
|
@@ -1,4 +1,4 @@
|
||||
.TH PCRE2DEMO 3 " 4 October 2024" "PCRE2 10.44"
|
||||
.TH PCRE2DEMO 3 " 8 October 2024" "PCRE2 10.44"
|
||||
.\"AUTOMATICALLY GENERATED BY PrepareRelease - do not EDIT!
|
||||
.SH NAME
|
||||
PCRE2DEMO - A demonstration C program for PCRE2
|
||||
|
@@ -629,9 +629,9 @@ contents of the matched part of the line and/or captured substrings into the
|
||||
text.
|
||||
.sp
|
||||
$<digits> or ${<digits>} is replaced by the captured substring of the given
|
||||
decimal number; zero substitutes the whole match. If the number is greater than
|
||||
the number of capturing substrings, or if the capture is unset, the replacement
|
||||
is empty.
|
||||
decimal number; $& (or the legacy $0) substitutes the whole match. If the
|
||||
number is greater than the number of capturing substrings, or if the capture
|
||||
is unset, the replacement is empty.
|
||||
.sp
|
||||
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
|
||||
newline; $r by carriage return; $t by tab; $v by vertical tab.
|
||||
@@ -914,9 +914,9 @@ available, provided that callouts were not completely disabled when
|
||||
zero-terminated string, which means it should not contain any internal binary
|
||||
zeros. It is written to the output, having first been passed through the same
|
||||
escape processing as text from the \fB--output\fP (\fB-O\fP) option (see
|
||||
above). However, $0 cannot be used to insert a matched substring because the
|
||||
match is still in progress. Instead, the single character '0' is inserted. Any
|
||||
syntax errors in the string (for example, a dollar not followed by another
|
||||
above). However, $0 or $& cannot be used to insert a matched substring because
|
||||
the match is still in progress. Instead, the single character '0' is inserted.
|
||||
Any syntax errors in the string (for example, a dollar not followed by another
|
||||
character) cause the callout to be ignored. No terminator is added to the
|
||||
output string, so if you want a newline, you must include it explicitly using
|
||||
the escape $n. For example:
|
||||
@@ -945,9 +945,9 @@ arguments:
|
||||
.sp
|
||||
Any substring (including the executable name) may contain escape sequences
|
||||
started by a dollar character. These are the same as for the \fB--output\fP
|
||||
(\fB-O\fP) option documented above, except that $0 cannot insert the matched
|
||||
string because the match is still in progress. Instead, the character '0'
|
||||
is inserted. If you need a literal dollar or pipe character in any
|
||||
(\fB-O\fP) option documented above, except that $0 or $& cannot insert the
|
||||
matched string because the match is still in progress. Instead, the character
|
||||
'0' is inserted. If you need a literal dollar or pipe character in any
|
||||
substring, use $$ or $| respectively. Here is an example:
|
||||
.sp
|
||||
echo -e "abcde\en12345" | pcre2grep \e
|
||||
|
@@ -702,188 +702,188 @@ OPTIONS
|
||||
captured substrings into the text.
|
||||
|
||||
$<digits> or ${<digits>} is replaced by the captured sub-
|
||||
string of the given decimal number; zero substitutes the
|
||||
whole match. If the number is greater than the number of cap-
|
||||
turing substrings, or if the capture is unset, the replace-
|
||||
ment is empty.
|
||||
string of the given decimal number; $& (or the legacy $0)
|
||||
substitutes the whole match. If the number is greater than
|
||||
the number of capturing substrings, or if the capture is un-
|
||||
set, the replacement is empty.
|
||||
|
||||
$a is replaced by bell; $b by backspace; $e by escape; $f by
|
||||
form feed; $n by newline; $r by carriage return; $t by tab;
|
||||
$a is replaced by bell; $b by backspace; $e by escape; $f by
|
||||
form feed; $n by newline; $r by carriage return; $t by tab;
|
||||
$v by vertical tab.
|
||||
|
||||
$o<digits> or $o{<digits>} is replaced by the character whose
|
||||
code point is the given octal number. In the first form, up
|
||||
to three octal digits are processed. When more digits are
|
||||
needed in Unicode mode to specify a wide character, the sec-
|
||||
code point is the given octal number. In the first form, up
|
||||
to three octal digits are processed. When more digits are
|
||||
needed in Unicode mode to specify a wide character, the sec-
|
||||
ond form must be used.
|
||||
|
||||
$x<digits> or $x{<digits>} is replaced by the character rep-
|
||||
resented by the given hexadecimal number. In the first form,
|
||||
up to two hexadecimal digits are processed. When more digits
|
||||
are needed in Unicode mode to specify a wide character, the
|
||||
$x<digits> or $x{<digits>} is replaced by the character rep-
|
||||
resented by the given hexadecimal number. In the first form,
|
||||
up to two hexadecimal digits are processed. When more digits
|
||||
are needed in Unicode mode to specify a wide character, the
|
||||
second form must be used.
|
||||
|
||||
Any other character is substituted by itself. In particular,
|
||||
Any other character is substituted by itself. In particular,
|
||||
$$ is replaced by a single dollar.
|
||||
|
||||
-o, --only-matching
|
||||
Show only the part of the line that matched a pattern instead
|
||||
of the whole line. In this mode, no context is shown. That
|
||||
is, the -A, -B, and -C options are ignored. If there is more
|
||||
than one match in a line, each of them is shown separately,
|
||||
on a separate line of output. If -o is combined with -v (in-
|
||||
vert the sense of the match to find non-matching lines), no
|
||||
output is generated, but the return code is set appropri-
|
||||
ately. If the matched portion of the line is empty, nothing
|
||||
is output unless the file name or line number are being
|
||||
printed, in which case they are shown on an otherwise empty
|
||||
of the whole line. In this mode, no context is shown. That
|
||||
is, the -A, -B, and -C options are ignored. If there is more
|
||||
than one match in a line, each of them is shown separately,
|
||||
on a separate line of output. If -o is combined with -v (in-
|
||||
vert the sense of the match to find non-matching lines), no
|
||||
output is generated, but the return code is set appropri-
|
||||
ately. If the matched portion of the line is empty, nothing
|
||||
is output unless the file name or line number are being
|
||||
printed, in which case they are shown on an otherwise empty
|
||||
line. This option is mutually exclusive with --output,
|
||||
--file-offsets and --line-offsets.
|
||||
|
||||
-onumber, --only-matching=number
|
||||
Show only the part of the line that matched the capturing
|
||||
Show only the part of the line that matched the capturing
|
||||
parentheses of the given number. Up to 50 capturing parenthe-
|
||||
ses are supported by default. This limit can be changed via
|
||||
the --om-capture option. A pattern may contain any number of
|
||||
capturing parentheses, but only those whose number is within
|
||||
the limit can be accessed by -o. An error occurs if the num-
|
||||
ses are supported by default. This limit can be changed via
|
||||
the --om-capture option. A pattern may contain any number of
|
||||
capturing parentheses, but only those whose number is within
|
||||
the limit can be accessed by -o. An error occurs if the num-
|
||||
ber specified by -o is greater than the limit.
|
||||
|
||||
-o0 is the same as -o without a number. Because these options
|
||||
can be given without an argument (see above), if an argument
|
||||
is present, it must be given in the same shell item, for ex-
|
||||
ample, -o3 or --only-matching=2. The comments given for the
|
||||
non-argument case above also apply to this option. If the
|
||||
specified capturing parentheses do not exist in the pattern,
|
||||
or were not set in the match, nothing is output unless the
|
||||
can be given without an argument (see above), if an argument
|
||||
is present, it must be given in the same shell item, for ex-
|
||||
ample, -o3 or --only-matching=2. The comments given for the
|
||||
non-argument case above also apply to this option. If the
|
||||
specified capturing parentheses do not exist in the pattern,
|
||||
or were not set in the match, nothing is output unless the
|
||||
file name or line number are being output.
|
||||
|
||||
If this option is given multiple times, multiple substrings
|
||||
are output for each match, in the order the options are
|
||||
given, and all on one line. For example, -o3 -o1 -o3 causes
|
||||
the substrings matched by capturing parentheses 3 and 1 and
|
||||
then 3 again to be output. By default, there is no separator
|
||||
If this option is given multiple times, multiple substrings
|
||||
are output for each match, in the order the options are
|
||||
given, and all on one line. For example, -o3 -o1 -o3 causes
|
||||
the substrings matched by capturing parentheses 3 and 1 and
|
||||
then 3 again to be output. By default, there is no separator
|
||||
(but see the next but one option).
|
||||
|
||||
--om-capture=number
|
||||
Set the number of capturing parentheses that can be accessed
|
||||
Set the number of capturing parentheses that can be accessed
|
||||
by -o. The default is 50.
|
||||
|
||||
--om-separator=text
|
||||
Specify a separating string for multiple occurrences of -o.
|
||||
The default is an empty string. Separating strings are never
|
||||
Specify a separating string for multiple occurrences of -o.
|
||||
The default is an empty string. Separating strings are never
|
||||
coloured.
|
||||
|
||||
-P, --no-ucp
|
||||
Starting from release 10.43, when UTF/Unicode mode is speci-
|
||||
fied with -u or -U, the PCRE2_UCP option is used by default.
|
||||
Starting from release 10.43, when UTF/Unicode mode is speci-
|
||||
fied with -u or -U, the PCRE2_UCP option is used by default.
|
||||
This means that the POSIX classes in patterns match more than
|
||||
just ASCII characters. For example, [:digit:] matches any
|
||||
Unicode decimal digit. The --no-ucp option suppresses
|
||||
PCRE2_UCP, thus restricting the POSIX classes to ASCII char-
|
||||
acters, as was the case in earlier releases. Note that there
|
||||
are now more fine-grained option settings within patterns
|
||||
that affect individual classes. For example, when in UCP
|
||||
just ASCII characters. For example, [:digit:] matches any
|
||||
Unicode decimal digit. The --no-ucp option suppresses
|
||||
PCRE2_UCP, thus restricting the POSIX classes to ASCII char-
|
||||
acters, as was the case in earlier releases. Note that there
|
||||
are now more fine-grained option settings within patterns
|
||||
that affect individual classes. For example, when in UCP
|
||||
mode, the sequence (?aP) restricts [:word:] to ASCII letters,
|
||||
while allowing \w to match Unicode letters and digits.
|
||||
|
||||
--posix-pattern-file
|
||||
When patterns are provided with the -f option, do not trim
|
||||
trailing spaces or ignore empty lines in a similar way to
|
||||
When patterns are provided with the -f option, do not trim
|
||||
trailing spaces or ignore empty lines in a similar way to
|
||||
other grep tools. To keep the behaviour consistent with older
|
||||
versions, if the pattern read was terminated with CRLF (as
|
||||
versions, if the pattern read was terminated with CRLF (as
|
||||
character literals) then both characters won't be included as
|
||||
part of it, so if you really need to have pattern ending in
|
||||
'\r', use an escape sequence or provide it by a different
|
||||
part of it, so if you really need to have pattern ending in
|
||||
'\r', use an escape sequence or provide it by a different
|
||||
method.
|
||||
|
||||
-q, --quiet
|
||||
Work quietly, that is, display nothing except error messages.
|
||||
The exit status indicates whether or not any matches were
|
||||
The exit status indicates whether or not any matches were
|
||||
found.
|
||||
|
||||
-r, --recursive
|
||||
If any given path is a directory, recursively scan the files
|
||||
it contains, taking note of any --include and --exclude set-
|
||||
tings. By default, a directory is read as a normal file; in
|
||||
some operating systems this gives an immediate end-of-file.
|
||||
This option is a shorthand for setting the -d option to "re-
|
||||
If any given path is a directory, recursively scan the files
|
||||
it contains, taking note of any --include and --exclude set-
|
||||
tings. By default, a directory is read as a normal file; in
|
||||
some operating systems this gives an immediate end-of-file.
|
||||
This option is a shorthand for setting the -d option to "re-
|
||||
curse".
|
||||
|
||||
--recursion-limit=number
|
||||
This is an obsolete synonym for --depth-limit. See --match-
|
||||
This is an obsolete synonym for --depth-limit. See --match-
|
||||
limit above for details.
|
||||
|
||||
-s, --no-messages
|
||||
Suppress error messages about non-existent or unreadable
|
||||
files. Such files are quietly skipped. However, the return
|
||||
Suppress error messages about non-existent or unreadable
|
||||
files. Such files are quietly skipped. However, the return
|
||||
code is still 2, even if matches were found in other files.
|
||||
|
||||
-t, --total-count
|
||||
This option is useful when scanning more than one file. If
|
||||
used on its own, -t suppresses all output except for a grand
|
||||
total number of matching lines (or non-matching lines if -v
|
||||
This option is useful when scanning more than one file. If
|
||||
used on its own, -t suppresses all output except for a grand
|
||||
total number of matching lines (or non-matching lines if -v
|
||||
is used) in all the files. If -t is used with -c, a grand to-
|
||||
tal is output except when the previous output is just one
|
||||
line. In other words, it is not output when just one file's
|
||||
count is listed. If file names are being output, the grand
|
||||
total is preceded by "TOTAL:". Otherwise, it appears as just
|
||||
another number. The -t option is ignored when used with -L
|
||||
(list files without matches), because the grand total would
|
||||
tal is output except when the previous output is just one
|
||||
line. In other words, it is not output when just one file's
|
||||
count is listed. If file names are being output, the grand
|
||||
total is preceded by "TOTAL:". Otherwise, it appears as just
|
||||
another number. The -t option is ignored when used with -L
|
||||
(list files without matches), because the grand total would
|
||||
always be zero.
|
||||
|
||||
-u, --utf Operate in UTF/Unicode mode. This option is available only if
|
||||
PCRE2 has been compiled with UTF-8 support. All patterns (in-
|
||||
cluding those for any --exclude and --include options) and
|
||||
all lines that are scanned must be valid strings of UTF-8
|
||||
cluding those for any --exclude and --include options) and
|
||||
all lines that are scanned must be valid strings of UTF-8
|
||||
characters. If an invalid UTF-8 string is encountered, an er-
|
||||
ror occurs.
|
||||
|
||||
-U, --utf-allow-invalid
|
||||
As --utf, but in addition subject lines may contain invalid
|
||||
UTF-8 code unit sequences. These can never form part of any
|
||||
pattern match. Patterns themselves, however, must still be
|
||||
As --utf, but in addition subject lines may contain invalid
|
||||
UTF-8 code unit sequences. These can never form part of any
|
||||
pattern match. Patterns themselves, however, must still be
|
||||
valid UTF-8 strings. This facility allows valid UTF-8 strings
|
||||
to be sought within arbitrary byte sequences in executable or
|
||||
other binary files. For more details about matching in non-
|
||||
other binary files. For more details about matching in non-
|
||||
valid UTF-8 strings, see the pcre2unicode(3) documentation.
|
||||
|
||||
-V, --version
|
||||
Write the version numbers of pcre2grep and the PCRE2 library
|
||||
to the standard output and then exit. Anything else on the
|
||||
Write the version numbers of pcre2grep and the PCRE2 library
|
||||
to the standard output and then exit. Anything else on the
|
||||
command line is ignored.
|
||||
|
||||
-v, --invert-match
|
||||
Invert the sense of the match, so that lines which do not
|
||||
match any of the patterns are the ones that are found. When
|
||||
this option is set, options such as --only-matching and
|
||||
--output, which specify parts of a match that are to be out-
|
||||
Invert the sense of the match, so that lines which do not
|
||||
match any of the patterns are the ones that are found. When
|
||||
this option is set, options such as --only-matching and
|
||||
--output, which specify parts of a match that are to be out-
|
||||
put, are ignored.
|
||||
|
||||
-w, --word-regex, --word-regexp
|
||||
Force the patterns only to match "words". That is, there must
|
||||
be a word boundary at the start and end of each matched
|
||||
string. This is equivalent to having "\b(?:" at the start of
|
||||
each pattern, and ")\b" at the end. This option applies only
|
||||
to the patterns that are matched against the contents of
|
||||
files; it does not apply to patterns specified by any of the
|
||||
be a word boundary at the start and end of each matched
|
||||
string. This is equivalent to having "\b(?:" at the start of
|
||||
each pattern, and ")\b" at the end. This option applies only
|
||||
to the patterns that are matched against the contents of
|
||||
files; it does not apply to patterns specified by any of the
|
||||
--include or --exclude options.
|
||||
|
||||
-x, --line-regex, --line-regexp
|
||||
Force the patterns to start matching only at the beginnings
|
||||
of lines, and in addition, require them to match entire
|
||||
Force the patterns to start matching only at the beginnings
|
||||
of lines, and in addition, require them to match entire
|
||||
lines. In multiline mode the match may be more than one line.
|
||||
This is equivalent to having "^(?:" at the start of each pat-
|
||||
tern and ")$" at the end. This option applies only to the
|
||||
patterns that are matched against the contents of files; it
|
||||
does not apply to patterns specified by any of the --include
|
||||
tern and ")$" at the end. This option applies only to the
|
||||
patterns that are matched against the contents of files; it
|
||||
does not apply to patterns specified by any of the --include
|
||||
or --exclude options.
|
||||
|
||||
-Z, --null
|
||||
Terminate file names in the regular output with a zero byte
|
||||
(the NUL character) instead of what would normally appear.
|
||||
This is useful when file names contain unusual characters
|
||||
such as colons, hyphens, or even newlines. The option does
|
||||
Terminate file names in the regular output with a zero byte
|
||||
(the NUL character) instead of what would normally appear.
|
||||
This is useful when file names contain unusual characters
|
||||
such as colons, hyphens, or even newlines. The option does
|
||||
not apply to file names in error messages.
|
||||
|
||||
|
||||
@@ -897,141 +897,141 @@ ENVIRONMENT VARIABLES
|
||||
|
||||
NEWLINES
|
||||
|
||||
The -N (--newline) option allows pcre2grep to scan files with newline
|
||||
conventions that differ from the default. This option affects only the
|
||||
way scanned files are processed. It does not affect the interpretation
|
||||
of files specified by the -f, --file-list, --exclude-from, or --in-
|
||||
The -N (--newline) option allows pcre2grep to scan files with newline
|
||||
conventions that differ from the default. This option affects only the
|
||||
way scanned files are processed. It does not affect the interpretation
|
||||
of files specified by the -f, --file-list, --exclude-from, or --in-
|
||||
clude-from options.
|
||||
|
||||
Any parts of the scanned input files that are written to the standard
|
||||
output are copied with whatever newline sequences they have in the in-
|
||||
put. However, if the final line of a file is output, and it does not
|
||||
end with a newline sequence, a newline sequence is added. If the new-
|
||||
line setting is CR, LF, CRLF or NUL, that line ending is output; for
|
||||
Any parts of the scanned input files that are written to the standard
|
||||
output are copied with whatever newline sequences they have in the in-
|
||||
put. However, if the final line of a file is output, and it does not
|
||||
end with a newline sequence, a newline sequence is added. If the new-
|
||||
line setting is CR, LF, CRLF or NUL, that line ending is output; for
|
||||
the other settings (ANYCRLF or ANY) a single NL is used.
|
||||
|
||||
The newline setting does not affect the way in which pcre2grep writes
|
||||
newlines in informational messages to the standard output and error
|
||||
streams. Under Windows, the standard output is set to be binary, so
|
||||
that "\r\n" at the ends of output lines that are copied from the input
|
||||
is not converted to "\r\r\n" by the C I/O library. This means that any
|
||||
messages written to the standard output must end with "\r\n". For all
|
||||
other operating systems, and for all messages to the standard error
|
||||
The newline setting does not affect the way in which pcre2grep writes
|
||||
newlines in informational messages to the standard output and error
|
||||
streams. Under Windows, the standard output is set to be binary, so
|
||||
that "\r\n" at the ends of output lines that are copied from the input
|
||||
is not converted to "\r\r\n" by the C I/O library. This means that any
|
||||
messages written to the standard output must end with "\r\n". For all
|
||||
other operating systems, and for all messages to the standard error
|
||||
stream, "\n" is used.
|
||||
|
||||
|
||||
OPTIONS COMPATIBILITY WITH GNU GREP
|
||||
|
||||
Many of the short and long forms of pcre2grep's options are the same as
|
||||
in the GNU grep program. Any long option of the form --xxx-regexp (GNU
|
||||
terminology) is also available as --xxx-regex (PCRE2 terminology).
|
||||
However, the --case-restrict, --depth-limit, -E, --file-list, --file-
|
||||
in the GNU grep program. Any long option of the form --xxx-regexp (GNU
|
||||
terminology) is also available as --xxx-regex (PCRE2 terminology).
|
||||
However, the --case-restrict, --depth-limit, -E, --file-list, --file-
|
||||
offsets, --heap-limit, --include-dir, --line-offsets, --locale,
|
||||
--match-limit, -M, --multiline, -N, --newline, --no-ucp, --om-separa-
|
||||
tor, --output, -P, -u, --utf, -U, and --utf-allow-invalid options are
|
||||
--match-limit, -M, --multiline, -N, --newline, --no-ucp, --om-separa-
|
||||
tor, --output, -P, -u, --utf, -U, and --utf-allow-invalid options are
|
||||
specific to pcre2grep, as is the use of the --only-matching option with
|
||||
a capturing parentheses number.
|
||||
|
||||
Although most of the common options work the same way, a few are dif-
|
||||
ferent in pcre2grep. For example, the --include option's argument is a
|
||||
Although most of the common options work the same way, a few are dif-
|
||||
ferent in pcre2grep. For example, the --include option's argument is a
|
||||
glob for GNU grep, but in pcre2grep it is a regular expression to which
|
||||
the -i option applies. If both the -c and -l options are given, GNU
|
||||
grep lists only file names, without counts, but pcre2grep gives the
|
||||
the -i option applies. If both the -c and -l options are given, GNU
|
||||
grep lists only file names, without counts, but pcre2grep gives the
|
||||
counts as well.
|
||||
|
||||
|
||||
OPTIONS WITH DATA
|
||||
|
||||
There are four different ways in which an option with data can be spec-
|
||||
ified. If a short form option is used, the data may follow immedi-
|
||||
ified. If a short form option is used, the data may follow immedi-
|
||||
ately, or (with one exception) in the next command line item. For exam-
|
||||
ple:
|
||||
|
||||
-f/some/file
|
||||
-f /some/file
|
||||
|
||||
The exception is the -o option, which may appear with or without data.
|
||||
Because of this, if data is present, it must follow immediately in the
|
||||
The exception is the -o option, which may appear with or without data.
|
||||
Because of this, if data is present, it must follow immediately in the
|
||||
same item, for example -o3.
|
||||
|
||||
If a long form option is used, the data may appear in the same command
|
||||
line item, separated by an equals character, or (with two exceptions)
|
||||
If a long form option is used, the data may appear in the same command
|
||||
line item, separated by an equals character, or (with two exceptions)
|
||||
it may appear in the next command line item. For example:
|
||||
|
||||
--file=/some/file
|
||||
--file /some/file
|
||||
|
||||
Note, however, that if you want to supply a file name beginning with ~
|
||||
as data in a shell command, and have the shell expand ~ to a home di-
|
||||
rectory, you must separate the file name from the option, because the
|
||||
Note, however, that if you want to supply a file name beginning with ~
|
||||
as data in a shell command, and have the shell expand ~ to a home di-
|
||||
rectory, you must separate the file name from the option, because the
|
||||
shell does not treat ~ specially unless it is at the start of an item.
|
||||
|
||||
The exceptions to the above are the --colour (or --color) and --only-
|
||||
matching options, for which the data is optional. If one of these op-
|
||||
tions does have data, it must be given in the first form, using an
|
||||
The exceptions to the above are the --colour (or --color) and --only-
|
||||
matching options, for which the data is optional. If one of these op-
|
||||
tions does have data, it must be given in the first form, using an
|
||||
equals character. Otherwise pcre2grep will assume that it has no data.
|
||||
|
||||
|
||||
USING PCRE2'S CALLOUT FACILITY
|
||||
|
||||
pcre2grep has, by default, support for calling external programs or
|
||||
scripts or echoing specific strings during matching by making use of
|
||||
PCRE2's callout facility. However, this support can be completely or
|
||||
partially disabled when pcre2grep is built. You can find out whether
|
||||
your binary has support for callouts by running it with the --help op-
|
||||
tion. If callout support is completely disabled, callouts in patterns
|
||||
are forbidden by pcre2grep. If the facility is partially disabled,
|
||||
calling external programs is not supported, and callouts that request
|
||||
pcre2grep has, by default, support for calling external programs or
|
||||
scripts or echoing specific strings during matching by making use of
|
||||
PCRE2's callout facility. However, this support can be completely or
|
||||
partially disabled when pcre2grep is built. You can find out whether
|
||||
your binary has support for callouts by running it with the --help op-
|
||||
tion. If callout support is completely disabled, callouts in patterns
|
||||
are forbidden by pcre2grep. If the facility is partially disabled,
|
||||
calling external programs is not supported, and callouts that request
|
||||
it are ignored.
|
||||
|
||||
A callout in a PCRE2 pattern is of the form (?C<arg>) where the argu-
|
||||
ment is either a number or a quoted string (see the pcre2callout docu-
|
||||
mentation for details). Numbered callouts are ignored by pcre2grep;
|
||||
A callout in a PCRE2 pattern is of the form (?C<arg>) where the argu-
|
||||
ment is either a number or a quoted string (see the pcre2callout docu-
|
||||
mentation for details). Numbered callouts are ignored by pcre2grep;
|
||||
only callouts with string arguments are useful.
|
||||
|
||||
Echoing a specific string
|
||||
|
||||
Starting the callout string with a pipe character invokes an echoing
|
||||
Starting the callout string with a pipe character invokes an echoing
|
||||
facility that avoids calling an external program or script. This facil-
|
||||
ity is always available, provided that callouts were not completely
|
||||
disabled when pcre2grep was built. The rest of the callout string is
|
||||
processed as a zero-terminated string, which means it should not con-
|
||||
tain any internal binary zeros. It is written to the output, having
|
||||
first been passed through the same escape processing as text from the
|
||||
--output (-O) option (see above). However, $0 cannot be used to insert
|
||||
a matched substring because the match is still in progress. Instead,
|
||||
the single character '0' is inserted. Any syntax errors in the string
|
||||
(for example, a dollar not followed by another character) cause the
|
||||
callout to be ignored. No terminator is added to the output string, so
|
||||
if you want a newline, you must include it explicitly using the escape
|
||||
$n. For example:
|
||||
ity is always available, provided that callouts were not completely
|
||||
disabled when pcre2grep was built. The rest of the callout string is
|
||||
processed as a zero-terminated string, which means it should not con-
|
||||
tain any internal binary zeros. It is written to the output, having
|
||||
first been passed through the same escape processing as text from the
|
||||
--output (-O) option (see above). However, $0 or $& cannot be used to
|
||||
insert a matched substring because the match is still in progress. In-
|
||||
stead, the single character '0' is inserted. Any syntax errors in the
|
||||
string (for example, a dollar not followed by another character) cause
|
||||
the callout to be ignored. No terminator is added to the output string,
|
||||
so if you want a newline, you must include it explicitly using the es-
|
||||
cape $n. For example:
|
||||
|
||||
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
|
||||
|
||||
Matching continues normally after the string is output. If you want to
|
||||
see only the callout output but not any output from an actual match,
|
||||
Matching continues normally after the string is output. If you want to
|
||||
see only the callout output but not any output from an actual match,
|
||||
you should end the pattern with (*FAIL).
|
||||
|
||||
Calling external programs or scripts
|
||||
|
||||
This facility can be independently disabled when pcre2grep is built. It
|
||||
is supported for Windows, where a call to _spawnvp() is used, for VMS,
|
||||
where lib$spawn() is used, and for any Unix-like environment where
|
||||
is supported for Windows, where a call to _spawnvp() is used, for VMS,
|
||||
where lib$spawn() is used, and for any Unix-like environment where
|
||||
fork() and execv() are available.
|
||||
|
||||
If the callout string does not start with a pipe (vertical bar) charac-
|
||||
ter, it is parsed into a list of substrings separated by pipe charac-
|
||||
ters. The first substring must be an executable name, with the follow-
|
||||
ter, it is parsed into a list of substrings separated by pipe charac-
|
||||
ters. The first substring must be an executable name, with the follow-
|
||||
ing substrings specifying arguments:
|
||||
|
||||
executable_name|arg1|arg2|...
|
||||
|
||||
Any substring (including the executable name) may contain escape se-
|
||||
quences started by a dollar character. These are the same as for the
|
||||
--output (-O) option documented above, except that $0 cannot insert the
|
||||
matched string because the match is still in progress. Instead, the
|
||||
character '0' is inserted. If you need a literal dollar or pipe charac-
|
||||
ter in any substring, use $$ or $| respectively. Here is an example:
|
||||
Any substring (including the executable name) may contain escape se-
|
||||
quences started by a dollar character. These are the same as for the
|
||||
--output (-O) option documented above, except that $0 or $& cannot in-
|
||||
sert the matched string because the match is still in progress. In-
|
||||
stead, the character '0' is inserted. If you need a literal dollar or
|
||||
pipe character in any substring, use $$ or $| respectively. Here is an
|
||||
example:
|
||||
|
||||
echo -e "abcde\n12345" | pcre2grep \
|
||||
'(?x)(.)(..(.))
|
||||
@@ -1044,43 +1044,43 @@ USING PCRE2'S CALLOUT FACILITY
|
||||
Arg1: [1] [234] [4] Arg2: |1| ()
|
||||
12345
|
||||
|
||||
The parameters for the system call that is used to run the program or
|
||||
The parameters for the system call that is used to run the program or
|
||||
script are zero-terminated strings. This means that binary zero charac-
|
||||
ters in the callout argument will cause premature termination of their
|
||||
substrings, and therefore should not be present. Any syntax errors in
|
||||
the string (for example, a dollar not followed by another character)
|
||||
ters in the callout argument will cause premature termination of their
|
||||
substrings, and therefore should not be present. Any syntax errors in
|
||||
the string (for example, a dollar not followed by another character)
|
||||
cause the callout to be ignored. If running the program fails for any
|
||||
reason (including the non-existence of the executable), a local match-
|
||||
reason (including the non-existence of the executable), a local match-
|
||||
ing failure occurs and the matcher backtracks in the normal way.
|
||||
|
||||
|
||||
MATCHING ERRORS
|
||||
|
||||
It is possible to supply a regular expression that takes a very long
|
||||
time to fail to match certain lines. Such patterns normally involve
|
||||
nested indefinite repeats, for example: (a+)*\d when matched against a
|
||||
line of a's with no final digit. The PCRE2 matching function has a re-
|
||||
source limit that causes it to abort in these circumstances. If this
|
||||
happens, pcre2grep outputs an error message and the line that caused
|
||||
the problem to the standard error stream. If there are more than 20
|
||||
It is possible to supply a regular expression that takes a very long
|
||||
time to fail to match certain lines. Such patterns normally involve
|
||||
nested indefinite repeats, for example: (a+)*\d when matched against a
|
||||
line of a's with no final digit. The PCRE2 matching function has a re-
|
||||
source limit that causes it to abort in these circumstances. If this
|
||||
happens, pcre2grep outputs an error message and the line that caused
|
||||
the problem to the standard error stream. If there are more than 20
|
||||
such errors, pcre2grep gives up.
|
||||
|
||||
The --match-limit option of pcre2grep can be used to set the overall
|
||||
resource limit. There are also other limits that affect the amount of
|
||||
memory used during matching; see the discussion of --heap-limit and
|
||||
The --match-limit option of pcre2grep can be used to set the overall
|
||||
resource limit. There are also other limits that affect the amount of
|
||||
memory used during matching; see the discussion of --heap-limit and
|
||||
--depth-limit above.
|
||||
|
||||
|
||||
DIAGNOSTICS
|
||||
|
||||
Exit status is 0 if any matches were found, 1 if no matches were found,
|
||||
and 2 for syntax errors, overlong lines, non-existent or inaccessible
|
||||
files (even if matches were found in other files) or too many matching
|
||||
and 2 for syntax errors, overlong lines, non-existent or inaccessible
|
||||
files (even if matches were found in other files) or too many matching
|
||||
errors. Using the -s option to suppress error messages about inaccessi-
|
||||
ble files does not affect the return code.
|
||||
|
||||
When run under VMS, the return code is placed in the symbol
|
||||
PCRE2GREP_RC because VMS does not distinguish between exit(0) and
|
||||
When run under VMS, the return code is placed in the symbol
|
||||
PCRE2GREP_RC because VMS does not distinguish between exit(0) and
|
||||
exit(1).
|
||||
|
||||
|
||||
|
@@ -2024,11 +2024,23 @@ switch (*(++string))
|
||||
*last = string;
|
||||
return DDE_ERROR;
|
||||
|
||||
case '&':
|
||||
/* In a callout, no capture is available. Return the character '0' for
|
||||
consistency with $0. */
|
||||
|
||||
if (callout) *value = '0';
|
||||
else
|
||||
{
|
||||
*value = 0;
|
||||
rc = DDE_CAPTURE;
|
||||
}
|
||||
break;
|
||||
|
||||
case '{':
|
||||
brace = TRUE;
|
||||
string++;
|
||||
if (!isdigit((unsigned char)(*string))) /* Syntax error: a decimal number required. */
|
||||
{
|
||||
if (!isdigit((unsigned char)(*string))) /* Syntax error: */
|
||||
{ /* a decimal number required. */
|
||||
if (!callout)
|
||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
||||
(int)(string - begin), "decimal number expected");
|
||||
@@ -2105,9 +2117,9 @@ switch (*(++string))
|
||||
{
|
||||
if (!isxdigit(*string)) break;
|
||||
if (*string >= '0' && *string <= '9')
|
||||
c = c *16 + *string++ - '0';
|
||||
c = c *16 + (*string++ - '0');
|
||||
else
|
||||
c = c * 16 + (*string++ | 0x20) - 'a' + 10;
|
||||
c = c * 16 + ((*string++ | 0x20) - 'a') + 10;
|
||||
}
|
||||
*value = c;
|
||||
string--; /* Point to last digit */
|
||||
|
@@ -140,7 +140,7 @@ static const int eint2[] = {
|
||||
92, REG_INVARG, /* invalid option bits with PCRE2_LITERAL */
|
||||
98, REG_EESCAPE, /* missing digit after \0 in NO_BS0 mode */
|
||||
99, REG_EESCAPE, /* \K in lookaround */
|
||||
102, REG_EESCAPE /* \ddd octal > \377 in PYTHON_OCTAL mode */
|
||||
102, REG_EESCAPE /* \ddd octal > \377 in PYTHON_OCTAL mode */
|
||||
};
|
||||
|
||||
/* Table of texts corresponding to POSIX error codes */
|
||||
|
4
testdata/grepoutput
vendored
4
testdata/grepoutput
vendored
@@ -876,6 +876,10 @@ RC=0
|
||||
./testdata/grepinput:a binary zero:zeroa
|
||||
./testdata/grepinput:the binary zero.:zerothe.
|
||||
RC=0
|
||||
./testdata/grepinput:the binary zero.:zerothe.
|
||||
./testdata/grepinput:a binary zero:zeroa
|
||||
./testdata/grepinput:the binary zero.:zerothe.
|
||||
RC=0
|
||||
the binary zero.:
|
||||
|
||||
RC=0
|
||||
|
Reference in New Issue
Block a user