#! /usr/bin/env python3

#                   PCRE2 UNICODE PROPERTY SUPPORT
#                   ------------------------------
#
# This file auto-generates Unicode property tests and their expected output.
# It is recommended to re-run this generator after the Unicode files are
# updated. The names of the generated files are `testinput` and `testoutput`
# and should be copied over to replace either test26 or test27 files.

import re
import sys

from GenerateCommon import \
    script_names, \
    script_abbrevs


def write_both(text):
    """Write `text` to both the generated test input and expected output."""
    input_file.write(text)
    output_file.write(text)


def to_string_char(ch_idx):
    """Return the text used for code point `ch_idx` in a subject line.

    Code points 32..127 are emitted as the literal character. Code points
    below 16 are emitted as `\\x{0N}` (zero-padded to two hex digits), and
    everything else as `\\x{N}` with lower-case hex digits.
    """
    if ch_idx < 128:
        if ch_idx < 16:
            return "\\x{0%x}" % ch_idx
        if ch_idx >= 32:
            return chr(ch_idx)
    return "\\x{%x}" % ch_idx


# ---------------------------------------------------------------------------
#                      UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

def gen_script_tests():
    """Generate the script (sc) and script extension (scx) property tests.

    Reads Unicode.tables/Scripts.txt and Unicode.tables/ScriptExtensions.txt
    and, for every script except "Unknown", writes patterns and subjects to
    the test input file together with the expected results to the test
    output file.

    One record is kept per script, indexed like `script_names`:
        [0] first code point listed for the script in Scripts.txt
        [1] last code point listed for the script in Scripts.txt
        [2] lowest code point listing the script in ScriptExtensions.txt
        [3] highest code point listing the script in ScriptExtensions.txt
        [4] a code point that lists the script as an extension but whose
            base script is different (None when no such code point was seen)
    """
    script_data = [None] * len(script_names)
    # Base script name of every code point (None when not listed).
    char_data = [None] * 0x110000

    property_re = re.compile(
        r"^([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+[A-Za-z]) +#")

    prev_name = ""
    script_idx = -1

    with open("Unicode.tables/Scripts.txt", encoding="utf-8") as f:
        version_pat = r"^# Scripts-(\d+\.\d+\.\d+)\.txt$"

        v = re.match(version_pat, f.readline())
        if v is None:
            # Fail with a readable message instead of an AttributeError.
            print("** Couldn't find the Unicode version in "
                  "Unicode.tables/Scripts.txt")
            sys.exit(1)
        unicode_version = v.group(1)

        write_both("# Unicode Script Extension tests for version " +
                   unicode_version + "\n\n")
        write_both("#perltest\n\n")

        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            name = match_obj.group(3)
            if name != prev_name:
                # Consecutive lines usually share a script, so only look the
                # index up again when the name changes.
                script_idx = script_names.index(name)
                prev_name = name

            low = int(match_obj.group(1), 16)
            high = low
            char_data[low] = name

            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)
                for ch in range(low + 1, high + 1):
                    char_data[ch] = name

            if script_data[script_idx] is None:
                script_data[script_idx] = [low, None, None, None, None]
            # Overwritten on every range, so it ends up as the last one listed.
            script_data[script_idx][1] = high

    # Cache of script abbreviation -> index into script_abbrevs.
    extended_script_indices = {}

    with open("Unicode.tables/ScriptExtensions.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            for abbrev in match_obj.group(3).split(" "):
                idx = extended_script_indices.get(abbrev)
                if idx is None:
                    idx = script_abbrevs.index(abbrev)
                    extended_script_indices[abbrev] = idx

                rec = script_data[idx]
                if rec[2] is None:
                    rec[2] = low
                    rec[3] = high
                else:
                    rec[2] = min(rec[2], low)
                    rec[3] = max(rec[3], high)

                if rec[4] is None:
                    # Look for a character in this range whose base script
                    # differs from the script named by the extension.
                    name = script_names[idx]
                    for ch in range(low, high + 1):
                        if char_data[ch] != name:
                            rec[4] = ch
                            break

    # Alternates between the short and long property name spellings.
    long_property_name = False

    for idx, rec in enumerate(script_data):
        script_name = script_names[idx]

        if script_name == "Unknown":
            continue

        script_abbrev = script_abbrevs[idx]

        write_both("# Base script check\n")
        write_both("/^\\p{sc=%s}/utf\n" % script_name)
        write_both(" %s\n" % to_string_char(rec[0]))
        output_file.write(" 0: %s\n" % to_string_char(rec[0]))
        write_both("\n")

        write_both("/^\\p{Script=%s}/utf\n" % script_abbrev)
        write_both(" %s\n" % to_string_char(rec[1]))
        output_file.write(" 0: %s\n" % to_string_char(rec[1]))
        write_both("\n")

        if rec[2] is not None:
            property_name = "Script_Extensions" if long_property_name else "scx"

            write_both("# Script extension check\n")
            write_both("/^\\p{%s}/utf\n" % script_name)
            write_both(" %s\n" % to_string_char(rec[2]))
            output_file.write(" 0: %s\n" % to_string_char(rec[2]))
            write_both("\n")

            write_both("/^\\p{%s=%s}/utf\n" % (property_name, script_abbrev))
            write_both(" %s\n" % to_string_char(rec[3]))
            output_file.write(" 0: %s\n" % to_string_char(rec[3]))
            write_both("\n")

            long_property_name = not long_property_name

            if rec[4] is not None:
                write_both("# Script extension only character\n")
                write_both("/^\\p{%s}/utf\n" % script_name)
                write_both(" %s\n" % to_string_char(rec[4]))
                output_file.write(" 0: %s\n" % to_string_char(rec[4]))
                write_both("\n")

                write_both("/^\\p{sc=%s}/utf\n" % script_name)
                write_both(" %s\n" % to_string_char(rec[4]))
                output_file.write("No match\n")
                write_both("\n")
            else:
                print("External character has not found for %s" % script_name)

        # The code point just past both the base and the extension range.
        high = rec[1]
        if rec[3] is not None and rec[3] > rec[1]:
            high = rec[3]

        write_both("# Character not in script\n")
        write_both("/^\\p{%s}/utf\n" % script_name)
        write_both(" %s\n" % to_string_char(high + 1))
        output_file.write("No match\n")
        write_both("\n")


try:
    input_file = open("testinput", "w")
    output_file = open("testoutput", "w")
except IOError:
    print("** Couldn't create output files")
    sys.exit(1)

# Close (and thereby flush) both files even when generation fails part-way.
try:
    write_both("# These tests were generated by maint/GenerateTest.py using PCRE2's UCP\n")
    write_both("# data, do not edit unless that data has changed and they are reflecting\n")
    write_both("# a previous version.\n\n")

    gen_script_tests()

    write_both("# End of test\n")
finally:
    input_file.close()
    output_file.close()