pcre2/maint/GenerateTest.py

#! /usr/bin/env python3

#                   PCRE2 UNICODE PROPERTY SUPPORT
#                   ------------------------------
#
# This file auto-generates Unicode property tests and their expected output.
# It is recommended to re-run this generator after the Unicode files are
# updated. The generated files are named `testinput` and `testoutput`; they
# should be copied over to replace either the test26 or the test27 files.

import re
import sys

from GenerateCommon import \
  script_names, \
  script_abbrevs

def write_both(text):
  """Write `text` unchanged to the test input file and then to the expected
  output file (the module-level `input_file` and `output_file`)."""
  for target in (input_file, output_file):
    target.write(text)

def to_string_char(ch_idx):
  r"""Return the text used for code point `ch_idx` in the generated tests.

  Printable ASCII characters (U+0020..U+007E) are returned as themselves.
  Every other code point is returned as a `\x{...}` escape written with
  lowercase hexadecimal digits and zero-padded to at least two digits
  (for example `\x{0a}`, `\x{1f}`, `\x{10ffff}`).
  """
  # DEL (0x7f) is a control character, not a printable one, so the literal
  # range stops at 0x7e and DEL is escaped like the other control characters.
  if 32 <= ch_idx < 127:
    return chr(ch_idx)
  # "%02x" only pads values below 0x10; larger values keep their own width.
  return "\\x{%02x}" % ch_idx

# Both generated files are created up front; if either cannot be opened the
# script reports the problem and exits with status 1.
try:
  input_file = open("testinput", "w")
  output_file = open("testoutput", "w")
except IOError:
  print("** Couldn't create output files")
  sys.exit(1)

# Banner marking the files as generated. It is written to both files.
write_both(
  "# These tests were generated by maint/GenerateTest.py using PCRE2's UCP\n"
  "# data, do not edit unless that data has changed and they are reflecting\n"
  "# a previous version.\n\n")

# ---------------------------------------------------------------------------
#                      UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------


def _parse_code_point_range(match_obj):
  """Return (low, high) for a matched data line; high equals low when the
  line names a single code point rather than a `low..high` range."""
  low = int(match_obj.group(1), 16)
  if match_obj.group(2) is None:
    return low, low
  return low, int(match_obj.group(2), 16)


def _write_script_test(prop, ch_idx, expect_match):
  r"""Emit one test: the pattern `/^\p{prop}/utf`, a subject line holding the
  single character `ch_idx`, and the expected result.

  The pattern and subject go to both files. The result line (` 0: <char>`
  when `expect_match` is true, otherwise `No match`) goes to the output file
  only. A blank line is then written to both files.
  """
  subject = to_string_char(ch_idx)
  write_both("/^\\p{%s}/utf\n" % prop)
  write_both("    %s\n" % subject)
  if expect_match:
    output_file.write(" 0: %s\n" % subject)
  else:
    output_file.write("No match\n")
  write_both("\n")


def gen_script_tests():
  """Generate the Unicode script and script extension tests.

  Reads `Unicode.tables/Scripts.txt` and `Unicode.tables/ScriptExtensions.txt`
  and writes, for every script in `script_names` except "Unknown", tests that
  check the script's first and last recorded code points, the lowest and
  highest code points of its script extension ranges (when it has any), a
  character that is in the extension ranges but assigned to another script
  (when one was found), and the code point just past the script's highest
  recorded one, which must not match.

  Exits with status 1 if the first line of Scripts.txt does not carry the
  Unicode version. Raises ValueError (from `list.index`) if a data file
  names a script or abbreviation missing from `script_names` or
  `script_abbrevs`.
  """
  # One record per script, indexed like script_names:
  #   [0] low end of the first Scripts.txt range seen for the script
  #   [1] high end of the last Scripts.txt range seen for the script
  #   [2] lowest code point of its ScriptExtensions.txt ranges (or None)
  #   [3] highest code point of its ScriptExtensions.txt ranges (or None)
  #   [4] first code point found in an extension range whose Scripts.txt
  #       script is a different one (or None)
  script_data = [None] * len(script_names)
  # Script name from Scripts.txt for every code point (None if not listed).
  char_data = [None] * 0x110000

  property_re = re.compile(r"^([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+[A-Za-z]) +#")
  prev_name = ""
  script_idx = -1

  with open("Unicode.tables/Scripts.txt", encoding="utf-8") as f:
    version_pat = r"^# Scripts-(\d+\.\d+\.\d+)\.txt$"
    v = re.match(version_pat, f.readline())
    if v is None:
      # Without this check a malformed first line would surface as an
      # AttributeError on None, which says nothing about the real problem.
      print("** Couldn't find the Unicode version in Unicode.tables/Scripts.txt")
      sys.exit(1)
    unicode_version = v.group(1)

    write_both("# Unicode Script Extension tests for version " + unicode_version + "\n\n")
    write_both("#perltest\n\n")

    for line in f:
      match_obj = property_re.match(line)
      if match_obj is None:
        continue

      name = match_obj.group(3)
      # Consecutive lines usually name the same script, so the list lookup
      # is only repeated when the name changes.
      if name != prev_name:
        script_idx = script_names.index(name)
        prev_name = name

      low, high = _parse_code_point_range(match_obj)
      for code_point in range(low, high + 1):
        char_data[code_point] = name

      if script_data[script_idx] is None:
        script_data[script_idx] = [low, None, None, None, None]
      script_data[script_idx][1] = high

  # Maps each abbreviation already seen in ScriptExtensions.txt to its index.
  extended_script_indices = {}

  with open("Unicode.tables/ScriptExtensions.txt", encoding="utf-8") as f:
    for line in f:
      match_obj = property_re.match(line)
      if match_obj is None:
        continue

      low, high = _parse_code_point_range(match_obj)

      # The third field is a space-separated list of script abbreviations.
      for abbrev in match_obj.group(3).split(" "):
        if abbrev not in extended_script_indices:
          idx = script_abbrevs.index(abbrev)
          extended_script_indices[abbrev] = idx
          rec = script_data[idx]
          rec[2] = low
          rec[3] = high
        else:
          idx = extended_script_indices[abbrev]
          rec = script_data[idx]
          if rec[2] > low:
            rec[2] = low
          if rec[3] < high:
            rec[3] = high

        if rec[4] is None:
          name = script_names[idx]
          # A separate loop variable keeps the script index `idx` intact.
          for code_point in range(low, high + 1):
            if char_data[code_point] != name:
              rec[4] = code_point
              break

  # Alternates between the short and long property name so that both
  # spellings appear in the generated tests.
  long_property_name = False

  for idx, rec in enumerate(script_data):
    script_name = script_names[idx]

    if script_name == "Unknown":
      continue

    script_abbrev = script_abbrevs[idx]

    write_both("# Base script check\n")
    _write_script_test("sc=%s" % script_name, rec[0], True)
    _write_script_test("Script=%s" % script_abbrev, rec[1], True)

    if rec[2] is not None:
      property_name = "scx"
      if long_property_name:
        property_name = "Script_Extensions"

      write_both("# Script extension check\n")
      _write_script_test(script_name, rec[2], True)
      _write_script_test("%s=%s" % (property_name, script_abbrev), rec[3], True)

      long_property_name = not long_property_name

      if rec[4] is not None:
        # The character matches through the extension but not through the
        # base script property.
        write_both("# Script extension only character\n")
        _write_script_test(script_name, rec[4], True)
        _write_script_test("sc=%s" % script_name, rec[4], False)
      else:
        print("External character has not found for %s" % script_name)

    # The code point just past the highest one recorded for the script, in
    # either file, is used as the non-matching subject.
    high = rec[1]
    if rec[3] is not None and rec[3] > rec[1]:
      high = rec[3]
    write_both("# Character not in script\n")
    _write_script_test(script_name, high + 1, False)

gen_script_tests()

write_both("# End of test\n")

# Close both files explicitly so buffered data is flushed and any write
# error is raised here rather than being lost at interpreter shutdown.
input_file.close()
output_file.close()