Files
pcre2/testdata/testinput12
2025-07-19 14:41:04 +01:00

728 lines
12 KiB
Plaintext
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# This set of tests is for UTF-16 and UTF-32 support, including Unicode
# properties. It is relevant only to the 16-bit and 32-bit libraries. The
# output is different for each library, so there are separate output files.
/ÃÃÃxxx/IB,utf,no_utf_check
/abc/utf
Ã]
# Check maximum character size
/\x{ffff}/IB,utf
/\x{10000}/IB,utf
/\x{100}/IB,utf
/\x{1000}/IB,utf
/\x{10000}/IB,utf
/\x{100000}/IB,utf
/\x{10ffff}/IB,utf
/[\x{ff}]/IB,utf
/[\x{100}]/IB,utf
/\x80/IB,utf
/\xff/IB,utf
/\x{D55c}\x{ad6d}\x{C5B4}/IB,utf
\x{D55c}\x{ad6d}\x{C5B4}
/\x{65e5}\x{672c}\x{8a9e}/IB,utf
\x{65e5}\x{672c}\x{8a9e}
/\x{80}/IB,utf
/\x{084}/IB,utf
/\x{104}/IB,utf
/\x{861}/IB,utf
/\x{212ab}/IB,utf
/[^ab\xC0-\xF0]/IB,utf
\x{f1}
\x{bf}
\x{100}
\x{1000}
\= Expect no match
\x{c0}
\x{f0}
/(\x{100}+|x)/IB,utf
/(\x{100}*a|x)/IB,utf
/(\x{100}{0,2}a|x)/IB,utf
/(\x{100}{1,2}a|x)/IB,utf
/\x{100}/IB,utf
/a\x{100}\x{101}*/IB,utf
/a\x{100}\x{101}+/IB,utf
/[^\x{c4}]/IB
/[\x{100}]/IB,utf
\x{100}
Z\x{100}
\x{100}Z
/[\xff]/IB,utf
>\x{ff}<
/[^\xff]/IB,utf
/\x{100}abc(xyz(?1))/IB,utf
/\777/I,utf
\x{1ff}
\777
/\x{100}+\x{200}/IB,utf
/\x{100}+X/IB,utf
/^[\QĀ\E-\QŐ\E/B,utf
/X/utf
XX\x{d800}\=no_utf_check
XX\x{da00}\=no_utf_check
XX\x{dc00}\=no_utf_check
XX\x{de00}\=no_utf_check
XX\x{dfff}\=no_utf_check
\= Expect UTF error
XX\x{d800}
XX\x{da00}
XX\x{dc00}
XX\x{de00}
XX\x{dfff}
XX\x{110000}
XX\x{d800}\x{1234}
\= Expect no match
XX\x{d800}\=offset=3
/(?<=.)X/utf
XX\x{d800}\=offset=3
/(*UTF16)\x{11234}/
abcd\x{11234}pqr
/(*UTF)\x{11234}/I
abcd\x{11234}pqr
/(*UTF-32)\x{11234}/
abcd\x{11234}pqr
/(*UTF-32)\x{112}/
abcd\x{11234}pqr
/(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I
/(*CRLF)(*UTF32)(*BSR_UNICODE)a\Rb/I
/\h/I,utf
ABC\x{09}
ABC\x{20}
ABC\x{a0}
ABC\x{1680}
ABC\x{180e}
ABC\x{2000}
ABC\x{202f}
ABC\x{205f}
ABC\x{3000}
/\v/I,utf
ABC\x{0a}
ABC\x{0b}
ABC\x{0c}
ABC\x{0d}
ABC\x{85}
ABC\x{2028}
/\h*A/I,utf
CDBABC
\x{2000}ABC
/\R*A/I,bsr=unicode,utf
CDBABC
\x{2028}A
/\v+A/I,utf
/\s?xxx\s/I,utf
/\sxxx\s/I,utf,tables=2
AB\x{85}xxx\x{a0}XYZ
AB\x{a0}xxx\x{85}XYZ
/\S \S/I,utf,tables=2
\x{a2} \x{84}
A Z
/a+/utf
a\x{123}aa\=offset=1
a\x{123}aa\=offset=2
a\x{123}aa\=offset=3
\= Expect no match
a\x{123}aa\=offset=4
\= Expect bad offset error
a\x{123}aa\=offset=5
a\x{123}aa\=offset=6
/\x{1234}+/Ii,utf
/\x{1234}+?/Ii,utf
/\x{1234}++/Ii,utf
/\x{1234}{2}/Ii,utf
/[^\x{c4}]/IB,utf
/X+\x{200}/IB,utf
/\R/I,utf
# Check bad offset
/a/utf
\= Expect bad UTF-16 offset, or no match in 32-bit
\x{10000}\=offset=1
\x{10000}ab\=offset=1
\= Expect 16-bit match, 32-bit no match
\x{10000}ab\=offset=2
\= Expect no match
\x{10000}ab\=offset=3
\= Expect no match in 16-bit, bad offset in 32-bit
\x{10000}ab\=offset=4
\= Expect bad offset
\x{10000}ab\=offset=5
/í¼€/utf
/\w+\x{C4}/B,utf
a\x{C4}\x{C4}
/\w+\x{C4}/B,utf,tables=2
a\x{C4}\x{C4}
/\W+\x{C4}/B,utf
!\x{C4}
/\W+\x{C4}/B,utf,tables=2
!\x{C4}
/\W+\x{A1}/B,utf
!\x{A1}
/\W+\x{A1}/B,utf,tables=2
!\x{A1}
/X\s+\x{A0}/B,utf
X\x20\x{A0}\x{A0}
/X\s+\x{A0}/B,utf,tables=2
X\x20\x{A0}\x{A0}
/\S+\x{A0}/B,utf
X\x{A0}\x{A0}
/\S+\x{A0}/B,utf,tables=2
X\x{A0}\x{A0}
/\x{a0}+\s!/B,utf
\x{a0}\x20!
/\x{a0}+\s!/B,utf,tables=2
\x{a0}\x20!
/(*UTF)abc/never_utf
/abc/utf,never_utf
/(*UCP)abc/never_ucp
/abc/ucp,never_ucp
/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf
/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf
/AB\x{1fb0}/IB,utf
/AB\x{1fb0}/IBi,utf
/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf
\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
\x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
/[ⱥ]/Bi,utf
/[^ⱥ]/Bi,utf
/[[:blank:]]/B,ucp
/\x{212a}+/Ii,utf
KKkk\x{212a}
/s+/Ii,utf
SSss\x{17f}
# Non-UTF characters should give errors in both 16-bit and 32-bit modes.
/\x{110000}/utf
/\o{4200000}/utf
/\x{100}*A/IB,utf
A
/\x{100}*\d(?R)/IB,utf
/[Z\x{100}]/IB,utf
Z\x{100}
\x{100}
\x{100}Z
/[z-\x{100}]/IB,utf
/[z\Qa-d]Ā\E]/IB,utf
\x{100}
Ā
/[ab\x{100}]abc(xyz(?1))/IB,utf
/\x{100}*\s/IB,utf
/\x{100}*\d/IB,utf
/\x{100}*\w/IB,utf
/\x{100}*\D/IB,utf
/\x{100}*\S/IB,utf
/\x{100}*\W/IB,utf
/[\x{105}-\x{109}]/IBi,utf
\x{104}
\x{105}
\x{109}
\= Expect no match
\x{100}
\x{10a}
/[z-\x{100}]/IBi,utf
Z
z
\x{39c}
\x{178}
|
\x{80}
\x{ff}
\x{100}
\x{101}
\= Expect no match
\x{102}
Y
y
/[z-\x{100}]/IBi,utf
/\x{3a3}B/IBi,utf
/./utf
\x{110000}
/(*UTF)abý¿¿¿¿¿z/B
/abý¿¿¿¿¿z/utf
/[\W\p{Any}]/B
abc
123
/[\W\pL]/B
abc
\x{100}
\x{308}
\= Expect no match
123
/[\s[:^ascii:]]/B,ucp
/\pP/ucp
\x{7fffffff}
/\p{ Aሴ}/utf
/\p{BC: Aሴ}/utf
# A special extra option allows escaped surrogate code points in 32-bit mode,
# but subjects containing them must not be UTF-checked. These patterns give
# errors in 16-bit mode.
/\x{d800}/I,utf,allow_surrogate_escapes
\x{d800}\=no_utf_check
/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
\x{dfff}\x{df01}\=no_utf_check
# This has different starting code units in 8-bit mode.
/^[^ab]/IB,utf
c
\x{ff}
\x{100}
\= Expect no match
aaa
# Offsets are different in 8-bit mode.
/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
123abcáyzabcdef789abcሴqr
# A few script run tests in non-UTF mode (but they need Unicode support)
/^(*script_run:.{4})/
\x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
\x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
\x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
/^(*sr:.*)/utf,allow_surrogate_escapes
\x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
\x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check
/(?(n/utf
/(?(á/utf
/^\cģ/utf
/(?'٠ABC'...)/utf
# Invalid UTF-16/32 tests.
/.../g,match_invalid_utf
abcd\x{df00}wxzy\x{df00}pqrs
abcd\x{80}wxzy\x{df00}pqrs
/abc/match_invalid_utf
ab\x{df00}ab\=ph
\= Expect no match
ab\x{df00}cdef\=ph
/.a/match_invalid_utf
ab\=ph
ab\=ps
\= Expect no match
b\x{df00}\=ph
b\x{df00}\=ps
/.a$/match_invalid_utf
ab\=ph
ab\=ps
\= Expect no match
b\x{df00}\=ph
b\x{df00}\=ps
/ab$/match_invalid_utf
ab\x{df00}cdeab
\= Expect no match
ab\x{df00}cde
/.../g,match_invalid_utf
abcd\x{80}wxzy\x{df00}pqrs
/(?<=x)../g,match_invalid_utf
abcd\x{80}wxzy\x{df00}pqrs
abcd\x{80}wxzy\x{df00}xpqrs
/X$/match_invalid_utf
\= Expect no match
X\x{df00}
/(?<=..)X/match_invalid_utf,aftertext
AB\x{df00}AQXYZ
AB\x{df00}AQXYZ\=offset=5
AB\x{df00}\x{df00}AXYZXC\=offset=5
\= Expect no match
AB\x{df00}XYZ
AB\x{df00}XYZ\=offset=3
AB\x{df00}AXYZ
AB\x{df00}AXYZ\=offset=4
AB\x{df00}\x{df00}AXYZ\=offset=5
/.../match_invalid_utf
\= Expect no match
A\x{d800}B
A\x{110000}B
/aa/utf,ucp,match_invalid_utf,global
aa\x{d800}aa
/aa/utf,ucp,match_invalid_utf,global
\x{d800}aa
/A\z/utf,match_invalid_utf
A\x{df00}\n
/ab$/match_invalid_utf
\= Expect no match
ab\x{df00}cde
/ab\z/match_invalid_utf
\= Expect no match
ab\x{df00}cde
/ab\Z/match_invalid_utf
\= Expect no match
ab\x{df00}cde
/(..)(*scs:(1)ab\z)/match_invalid_utf
ab\x{df00}cde
/(..)(*scs:(1)ab\Z)/match_invalid_utf
ab\x{df00}cde
/(..)(*scs:(1)ab$)/match_invalid_utf
ab\x{df00}cde
# ----------------------------------------------------
/(*UTF)(?=\x{123})/I
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
/[\xff\x{ffff}]/I,utf
/[\xff\x{ff}]/I,utf
/[\xff\x{ff}]/I
/[Ss]/I
/[Ss]/I,utf
/(?:\x{ff}|\x{3000})/I,utf
# ----------------------------------------------------
# UCP and casing tests
/\x{120}/iI
/\x{c1}/iI,ucp
/[\x{120}\x{121}]/iB,ucp
/[ab\x{120}]+/iB,ucp
aABb\x{121}\x{120}
/\x{c1}/i,no_start_optimize
\= Expect no match
\x{e1}
/\x{120}\x{c1}/i,ucp,no_start_optimize
\x{121}\x{e1}
/\x{120}\x{c1}/i,ucp
\x{121}\x{e1}
/[^\x{120}]/i,no_start_optimize
\x{121}
/[^\x{120}]/i,ucp,no_start_optimize
\= Expect no match
\x{121}
/[^\x{120}]/i
\x{121}
/[^\x{120}]/i,ucp
\= Expect no match
\x{121}
/\x{120}{2}/i,ucp
\x{121}\x{121}
/[^\x{120}]{2}/i,ucp
\= Expect no match
\x{121}\x{121}
/\x{c1}+\x{e1}/iB,ucp
\x{c1}\x{c1}\x{c1}
/\x{c1}+\x{e1}/iIB,ucp
\x{c1}\x{c1}\x{c1}
\x{e1}\x{e1}\x{e1}
/a|\x{c1}/iI,ucp
\x{e1}xxx
/\x{c1}|\x{e1}/iI,ucp
/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
X\x{e1}Y
/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
X\x{121}Y
/s/i,ucp
\x{17f}
/s/i,utf
\x{17f}
/[^s]/i,ucp
\= Expect no match
\x{17f}
/[^s]/i,utf
\= Expect no match
\x{17f}
/(.) \1/i,ucp
i I
/(.) \1/i,ucp,turkish_casing
\= Expect no match
i I
/(.) \1/i,ucp
i I
\x{212a} k
\= Expect no match
i \x{0130}
\x{0131} I
/(.) \1/i,ucp,turkish_casing
\x{212a} k
i \x{0130}
\x{0131} I
\= Expect no match
i I
/(.) (?r:\1)/i,ucp,turkish_casing
i I
\= Expect no match
i \x{0130}
\x{0131} I
\x{212a} k
/[a-z][^i]I/ucp,turkish_casing
bII
b\x{0130}I
b\x{0131}I
\= Expect no match
biI
/[a-z][^i]I/i,ucp,turkish_casing
b\x{0131}I
bII
\= Expect no match
biI
b\x{0130}I
/[a-z](?r:[^i])I/i,ucp,turkish_casing
b\x{0131}I
b\x{0130}I
\= Expect no match
bII
biI
/b(?r:[\x{00FF}-\x{FFEE}])/i,ucp,turkish_casing
b\x{0130}
b\x{0131}
B\x{212a}
\= Expect no match
bi
bI
bk
/[\x60-\x7f]/i,ucp,turkish_casing
i
\= Expect no match
I
/[\x60-\xc0]/i,ucp,turkish_casing
i
\= Expect no match
I
/[\x80-\xc0]/i,ucp,turkish_casing
\= Expect no match
i
I
# ----------------------------------------------------
/b[\x{00FF}-\x{FFEE}]/ir
b\x{0130}
b\x{0131}
B\x{212a}
\= Expect no match
bi
bI
bk
# Quantifier after a literal that has the value of META_ACCEPT (not UTF). This
# fails in 16-bit mode, but is OK for 32-bit.
/\x{802a0000}*/
\x{802a0000}\x{802a0000}
# UTF matching without UTF, check invalid UTF characters
/\X++/
a\x{110000}\x{ffffffff}
# This used to loop in 32-bit mode; it will fail in 16-bit mode.
/[\x{ffffffff}]/caseless,ucp
\x{ffffffff}xyz
# These are 32-bit tests for handling 0xffffffff when in UCP caseless mode. They
# will give errors in 16-bit mode.
/k*\x{ffffffff}/caseless,ucp
\x{ffffffff}
/k+\x{ffffffff}/caseless,ucp,no_start_optimize
K\x{ffffffff}
\= Expect no match
\x{ffffffff}\x{ffffffff}
/k{2}\x{ffffffff}/caseless,ucp,no_start_optimize
\= Expect no match
\x{ffffffff}\x{ffffffff}\x{ffffffff}
/k\x{ffffffff}/caseless,ucp,no_start_optimize
K\x{ffffffff}
\= Expect no match
\x{ffffffff}\x{ffffffff}\x{ffffffff}
/k{2,}?Z/caseless,ucp,no_start_optimize,no_auto_possess
\= Expect no match
Kk\x{ffffffff}\x{ffffffff}\x{ffffffff}Z
/[sk](?r:[sk])[sk]/Bi,ucp
SKS
sks
\x{212a}S\x{17f}
\x{17f}K\x{212a}
\= Expect no match
s\x{212a}s
K\x{17f}K
# ---------------------------------------------------------
# End of testinput12