mirror of
https://github.com/PCRE2Project/pcre2.git
synced 2025-10-17 07:04:13 +08:00

UCD 16 makes a lot of changes to scripts, so make sure that we have sufficient coverage by keeping the original autogenerated tests in addition. Complete the code updates for changes to ScriptExtensions.txt which is no longer sorted by script and allow for multiple unicode property test files, depending on Unicode version.
187 lines
5.5 KiB
Python
Executable File
187 lines
5.5 KiB
Python
Executable File
#! /usr/bin/python
|
|
|
|
# PCRE2 UNICODE PROPERTY SUPPORT
|
|
# ------------------------------
|
|
#
|
|
# This file auto-generates Unicode property tests and their expected output.
|
|
# It is recommended to re-run this generator after the Unicode files are
|
|
# updated. The names of the generated files are `testinput` and `testoutput`
|
|
# and should be copied over to replace either test26 or test27 files.
|
|
|
|
import re
|
|
import sys
|
|
|
|
from GenerateCommon import \
|
|
script_names, \
|
|
script_abbrevs
|
|
|
|
def write_both(text):
    """Append ``text`` to the generated test input and expected output files."""
    # The input file first, then the expected-output file, as before.
    for stream in (input_file, output_file):
        stream.write(text)
|
|
|
|
def to_string_char(ch_idx):
    """Return the text used for code point ``ch_idx`` in the test files.

    Printable ASCII (32..126) is returned as the literal character.  Every
    other code point is returned as a brace-delimited hexadecimal escape
    with at least two hex digits.
    """
    # DEL (127) is a control character, so it must be escaped as well;
    # only the range 32..126 is safe to emit literally.
    if 32 <= ch_idx < 127:
        return chr(ch_idx)
    # "%02x" zero-pads values below 16 and leaves larger values untouched,
    # which keeps the escapes identical to the previously generated ones.
    return "\\x{%02x}" % ch_idx
|
|
|
|
# Create (or truncate) the two generated files in the current directory.
# Nothing useful can be done without them, so give up on any failure.
try:
    input_file, output_file = open("testinput", "w"), open("testoutput", "w")
except IOError:
    print("** Couldn't create output files")
    raise SystemExit(1)
|
|
|
|
# Banner placed at the top of both generated files.
write_both(
    "# These tests were generated by maint/GenerateTest.py using PCRE2's UCP\n"
    "# data, do not edit unless that data has changed and they are reflecting\n"
    "# a previous version.\n\n"
)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# UNICODE SCRIPT EXTENSION TESTS
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def gen_script_tests():
    """Write the script and script-extension property tests.

    Reads Unicode.tables/Scripts.txt and Unicode.tables/ScriptExtensions.txt
    (relative to the current directory) and, for every script in
    script_names except "Unknown", writes test patterns with their subject
    lines to both generated files and the expected results to the output
    file only.

    Raises:
        ValueError: if the first line of Scripts.txt is not the expected
            "# Scripts-<version>.txt" header, or if a script name or
            abbreviation found in the data files is not known to
            GenerateCommon (raised by list.index).
    """

    def write_match_test(pattern, ch_idx):
        # One test whose subject is expected to match the property.
        subject = to_string_char(ch_idx)
        write_both("/^\\p{%s}/utf\n" % pattern)
        write_both(" %s\n" % subject)
        output_file.write(" 0: %s\n" % subject)
        write_both("\n")

    def write_no_match_test(pattern, ch_idx):
        # One test whose subject is expected NOT to match the property.
        write_both("/^\\p{%s}/utf\n" % pattern)
        write_both(" %s\n" % to_string_char(ch_idx))
        output_file.write("No match\n")
        write_both("\n")

    # One record per script, indexed like script_names:
    #   [0] low end of the first Scripts.txt range seen for the script
    #   [1] high end of the last Scripts.txt range seen for the script
    #   [2] lowest code point listing the script in ScriptExtensions.txt
    #   [3] highest code point listing the script in ScriptExtensions.txt
    #   [4] a code point listing the script in ScriptExtensions.txt whose
    #       base script (from Scripts.txt) is a different one
    script_data = [None] * len(script_names)

    # Base script name of every code point; None when Scripts.txt does not
    # mention the code point.
    char_data = [None] * 0x110000

    property_re = re.compile(r"^([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+[A-Za-z]) +#")

    # Consecutive lines usually name the same script, so the index lookup
    # is only repeated when the name changes.
    prev_name = ""
    script_idx = -1

    with open("Unicode.tables/Scripts.txt") as f:
        version_pat = r"^# Scripts-(\d+\.\d+\.\d+)\.txt$"
        v = re.match(version_pat, f.readline())
        if v is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("Unicode.tables/Scripts.txt does not start with "
                             "a '# Scripts-<version>.txt' header line")
        unicode_version = v.group(1)

        write_both("# Unicode Script Extension tests for version " + unicode_version + "\n\n")
        write_both("#perltest\n\n")

        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                # Comment, blank line, or anything else that is not data.
                continue

            name = match_obj.group(3)
            if name != prev_name:
                script_idx = script_names.index(name)
                prev_name = name

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            for code_point in range(low, high + 1):
                char_data[code_point] = name

            if script_data[script_idx] is None:
                script_data[script_idx] = [low, None, None, None, None]
            script_data[script_idx][1] = high

    # Script abbreviation -> index, for abbreviations already seen in
    # ScriptExtensions.txt (the file is not grouped by script).
    extended_script_indices = {}

    with open("Unicode.tables/ScriptExtensions.txt") as f:
        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            # The third field is a space-separated list of abbreviations.
            for abbrev in match_obj.group(3).split(" "):
                if abbrev not in extended_script_indices:
                    ext_idx = script_abbrevs.index(abbrev)
                    extended_script_indices[abbrev] = ext_idx
                    rec = script_data[ext_idx]
                    rec[2] = low
                    rec[3] = high
                else:
                    ext_idx = extended_script_indices[abbrev]
                    rec = script_data[ext_idx]
                    if rec[2] > low:
                        rec[2] = low
                    if rec[3] < high:
                        rec[3] = high

                if rec[4] is None:
                    # Look for a code point that has this script through
                    # its extensions only, i.e. whose base script differs.
                    name = script_names[ext_idx]
                    for code_point in range(low, high + 1):
                        if char_data[code_point] != name:
                            rec[4] = code_point
                            break

    # Alternates between the short and the long property name so that both
    # spellings are exercised.
    long_property_name = False

    for idx, rec in enumerate(script_data):
        script_name = script_names[idx]
        if script_name == "Unknown":
            continue

        script_abbrev = script_abbrevs[idx]

        write_both("# Base script check\n")
        write_match_test("sc=%s" % script_name, rec[0])
        write_match_test("Script=%s" % script_abbrev, rec[1])

        if rec[2] is not None:
            property_name = "scx"
            if long_property_name:
                property_name = "Script_Extensions"

            write_both("# Script extension check\n")
            write_match_test(script_name, rec[2])
            write_match_test("%s=%s" % (property_name, script_abbrev), rec[3])

            long_property_name = not long_property_name

            if rec[4] is not None:
                # Matches the bare (extension-aware) property, but not the
                # base script property.
                write_both("# Script extension only character\n")
                write_match_test(script_name, rec[4])
                write_no_match_test("sc=%s" % script_name, rec[4])
            else:
                print("External character has not found for %s" % script_name)

        # Use the code point just past the highest one recorded for the
        # script, from either data file, as the negative test subject.
        high = rec[1]
        if rec[3] is not None and rec[3] > rec[1]:
            high = rec[3]
        write_both("# Character not in script\n")
        write_no_match_test(script_name, high + 1)
|
|
|
|
# Generate the tests, then close both files explicitly so that everything
# written is flushed to disk even when generation stops with an error.
try:
    gen_script_tests()

    write_both("# End of test\n")
finally:
    input_file.close()
    output_file.close()
|