mirror of
https://github.com/google/re2.git
synced 2025-10-19 11:23:17 +08:00

... and make sure it stays. Change-Id: Iab9f9e9445545a9c97aefd4d83691a70fde2b1c8 Reviewed-on: https://code-review.googlesource.com/c/re2/+/63111 Reviewed-by: Paul Wankadia <junyer@google.com> Reviewed-by: Alex Chernyakhovsky <achernya@google.com>
353 lines
12 KiB
C++
353 lines
12 KiB
C++
// Copyright 2019 The RE2 Authors. All Rights Reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
#include <stddef.h>
|
|
#include <sys/types.h>
|
|
|
|
#include <memory>
|
|
#include <stdexcept>
|
|
#include <string>
|
|
#include <tuple>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "absl/strings/string_view.h"
|
|
#include "pybind11/buffer_info.h"
|
|
#include "pybind11/gil.h"
|
|
#include "pybind11/pybind11.h"
|
|
#include "pybind11/pytypes.h"
|
|
#include "pybind11/stl.h" // IWYU pragma: keep
|
|
#include "re2/filtered_re2.h"
|
|
#include "re2/re2.h"
|
|
#include "re2/set.h"
|
|
|
|
#ifdef _WIN32
|
|
#include <basetsd.h>
|
|
#define ssize_t SSIZE_T
|
|
#endif
|
|
|
|
namespace re2_python {
|
|
|
|
// This is conventional.
|
|
namespace py = pybind11;
|
|
|
|
// In terms of the pybind11 API, a py::buffer is merely a py::object that
|
|
// supports the buffer interface/protocol and you must explicitly request
|
|
// a py::buffer_info in order to access the actual bytes. Under the hood,
|
|
// the py::buffer_info manages a reference count to the py::buffer, so it
|
|
// must be constructed and subsequently destructed while holding the GIL.
|
|
static inline absl::string_view FromBytes(const py::buffer_info& bytes) {
|
|
char* data = reinterpret_cast<char*>(bytes.ptr);
|
|
ssize_t size = bytes.size;
|
|
return absl::string_view(data, size);
|
|
}
|
|
|
|
// Returns the length in bytes of the UTF-8 sequence introduced by `*ptr`,
// decided by the high nibble of that byte alone: 0x0-0xB map to 1, 0xC-0xD to
// 2, 0xE to 3 and 0xF to 4. Continuation bytes are not inspected.
static inline int OneCharLen(const char* ptr) {
  static constexpr char kLengthByHighNibble[16] = {1, 1, 1, 1, 1, 1, 1, 1,
                                                   1, 1, 1, 1, 2, 2, 3, 4};
  const unsigned char lead = static_cast<unsigned char>(*ptr);
  return kLengthByHighNibble[lead >> 4];
}
|
|
|
|
// Helper function for when Python encodes str to bytes and then needs to
|
|
// convert str offsets to bytes offsets. Assumes that text is valid UTF-8.
|
|
ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) {
|
|
auto bytes = buffer.request();
|
|
auto text = FromBytes(bytes);
|
|
auto ptr = text.data() + pos;
|
|
auto end = text.data() + text.size();
|
|
while (ptr < end && len > 0) {
|
|
ptr += OneCharLen(ptr);
|
|
--len;
|
|
}
|
|
return ptr - (text.data() + pos);
|
|
}
|
|
|
|
// Helper function for when Python decodes bytes to str and then needs to
|
|
// convert bytes offsets to str offsets. Assumes that text is valid UTF-8.
|
|
ssize_t BytesToCharLen(py::buffer buffer, ssize_t pos, ssize_t endpos) {
|
|
auto bytes = buffer.request();
|
|
auto text = FromBytes(bytes);
|
|
auto ptr = text.data() + pos;
|
|
auto end = text.data() + endpos;
|
|
ssize_t len = 0;
|
|
while (ptr < end) {
|
|
ptr += OneCharLen(ptr);
|
|
++len;
|
|
}
|
|
return len;
|
|
}
|
|
|
|
// Factory used as the Python-side constructor of RE2: builds an RE2 from the
// pattern bytes exposed by `buffer` and from `options`.
std::unique_ptr<RE2> RE2InitShim(py::buffer buffer,
                                 const RE2::Options& options) {
  // The buffer info has to stay alive while the pattern view is in use.
  const auto info = buffer.request();
  return std::make_unique<RE2>(FromBytes(info), options);
}
|
|
|
|
// Exposes self.error() to Python as bytes, i.e. the std::string is handed
// over as-is rather than being decoded to str.
py::bytes RE2ErrorShim(const RE2& self) {
  const std::string& message = self.error();
  return py::bytes(message);
}
|
|
|
|
// Returns the (name, index) pairs of self.NamedCapturingGroups(), with each
// name handed over as bytes rather than being decoded to str.
//
// The output is sized from the named groups themselves rather than from
// NumberOfCapturingGroups(): that count also includes unnamed groups and can
// be negative (RE2 documents -1 for a regexp that was not valid on
// construction), which must never reach reserve().
std::vector<std::pair<py::bytes, int>> RE2NamedCapturingGroupsShim(
    const RE2& self) {
  const auto& named = self.NamedCapturingGroups();
  std::vector<std::pair<py::bytes, int>> groups;
  groups.reserve(named.size());
  for (const auto& it : named) {
    groups.emplace_back(it.first, it.second);
  }
  return groups;
}
|
|
|
|
// Adapts RE2::ProgramFanout(), which fills an output parameter, into a
// function that returns the histogram by value. The value returned by
// ProgramFanout() itself is not forwarded.
std::vector<int> RE2ProgramFanoutShim(const RE2& self) {
  std::vector<int> fanout;
  static_cast<void>(self.ProgramFanout(&fanout));
  return fanout;
}
|
|
|
|
// Adapts RE2::ReverseProgramFanout(), which fills an output parameter, into a
// function that returns the histogram by value. The value returned by
// ReverseProgramFanout() itself is not forwarded.
std::vector<int> RE2ReverseProgramFanoutShim(const RE2& self) {
  std::vector<int> fanout;
  static_cast<void>(self.ReverseProgramFanout(&fanout));
  return fanout;
}
|
|
|
|
std::tuple<bool, py::bytes, py::bytes> RE2PossibleMatchRangeShim(
|
|
const RE2& self, int maxlen) {
|
|
std::string min, max;
|
|
// Return std::string as bytes. That is, without decoding to str.
|
|
return {self.PossibleMatchRange(&min, &max, maxlen), min, max};
|
|
}
|
|
|
|
std::vector<std::pair<ssize_t, ssize_t>> RE2MatchShim(const RE2& self,
|
|
RE2::Anchor anchor,
|
|
py::buffer buffer,
|
|
ssize_t pos,
|
|
ssize_t endpos) {
|
|
auto bytes = buffer.request();
|
|
auto text = FromBytes(bytes);
|
|
const int num_groups = self.NumberOfCapturingGroups() + 1; // need $0
|
|
std::vector<absl::string_view> groups;
|
|
groups.resize(num_groups);
|
|
py::gil_scoped_release release_gil;
|
|
if (!self.Match(text, pos, endpos, anchor, groups.data(), groups.size())) {
|
|
// Ensure that groups are null before converting to spans!
|
|
for (auto& it : groups) {
|
|
it = absl::string_view();
|
|
}
|
|
}
|
|
std::vector<std::pair<ssize_t, ssize_t>> spans;
|
|
spans.reserve(num_groups);
|
|
for (const auto& it : groups) {
|
|
if (it.data() == NULL) {
|
|
spans.emplace_back(-1, -1);
|
|
} else {
|
|
spans.emplace_back(it.data() - text.data(),
|
|
it.data() - text.data() + it.size());
|
|
}
|
|
}
|
|
return spans;
|
|
}
|
|
|
|
// Applies RE2::QuoteMeta() to the bytes exposed by `buffer` and returns the
// result as bytes, i.e. the std::string is not decoded to str.
py::bytes RE2QuoteMetaShim(py::buffer buffer) {
  const auto info = buffer.request();
  const std::string quoted = RE2::QuoteMeta(FromBytes(info));
  return py::bytes(quoted);
}
|
|
|
|
class Set {
|
|
public:
|
|
Set(RE2::Anchor anchor, const RE2::Options& options)
|
|
: set_(options, anchor) {}
|
|
|
|
~Set() = default;
|
|
|
|
// Not copyable or movable.
|
|
Set(const Set&) = delete;
|
|
Set& operator=(const Set&) = delete;
|
|
|
|
int Add(py::buffer buffer) {
|
|
auto bytes = buffer.request();
|
|
auto pattern = FromBytes(bytes);
|
|
int index = set_.Add(pattern, /*error=*/NULL); // -1 on error
|
|
return index;
|
|
}
|
|
|
|
bool Compile() {
|
|
// Compiling can fail.
|
|
return set_.Compile();
|
|
}
|
|
|
|
std::vector<int> Match(py::buffer buffer) const {
|
|
auto bytes = buffer.request();
|
|
auto text = FromBytes(bytes);
|
|
std::vector<int> matches;
|
|
py::gil_scoped_release release_gil;
|
|
set_.Match(text, &matches);
|
|
return matches;
|
|
}
|
|
|
|
private:
|
|
RE2::Set set_;
|
|
};
|
|
|
|
class Filter {
|
|
public:
|
|
Filter() = default;
|
|
~Filter() = default;
|
|
|
|
// Not copyable or movable.
|
|
Filter(const Filter&) = delete;
|
|
Filter& operator=(const Filter&) = delete;
|
|
|
|
int Add(py::buffer buffer, const RE2::Options& options) {
|
|
auto bytes = buffer.request();
|
|
auto pattern = FromBytes(bytes);
|
|
int index = -1; // not clobbered on error
|
|
filter_.Add(pattern, options, &index);
|
|
return index;
|
|
}
|
|
|
|
bool Compile() {
|
|
std::vector<std::string> atoms;
|
|
filter_.Compile(&atoms);
|
|
RE2::Options options;
|
|
options.set_literal(true);
|
|
options.set_case_sensitive(false);
|
|
set_ = std::make_unique<RE2::Set>(options, RE2::UNANCHORED);
|
|
for (int i = 0; i < static_cast<int>(atoms.size()); ++i) {
|
|
if (set_->Add(atoms[i], /*error=*/NULL) != i) {
|
|
// Should never happen: the atom is a literal!
|
|
py::pybind11_fail("set_->Add() failed");
|
|
}
|
|
}
|
|
// Compiling can fail.
|
|
return set_->Compile();
|
|
}
|
|
|
|
std::vector<int> Match(py::buffer buffer, bool potential) const {
|
|
if (set_ == nullptr) {
|
|
py::pybind11_fail("Match() called before compiling");
|
|
}
|
|
|
|
auto bytes = buffer.request();
|
|
auto text = FromBytes(bytes);
|
|
std::vector<int> atoms;
|
|
py::gil_scoped_release release_gil;
|
|
set_->Match(text, &atoms);
|
|
std::vector<int> matches;
|
|
if (potential) {
|
|
filter_.AllPotentials(atoms, &matches);
|
|
} else {
|
|
filter_.AllMatches(text, atoms, &matches);
|
|
}
|
|
return matches;
|
|
}
|
|
|
|
const RE2& GetRE2(int index) const {
|
|
return filter_.GetRE2(index);
|
|
}
|
|
|
|
private:
|
|
re2::FilteredRE2 filter_;
|
|
std::unique_ptr<RE2::Set> set_;
|
|
};
|
|
|
|
// Defines the `_re2` extension module: the two offset helpers, plus the RE2
// class (with its nested Anchor enum and Options class, which in turn nests
// the Encoding enum), the Set class and the Filter class.
PYBIND11_MODULE(_re2, module) {
  // Translate exceptions thrown by py::pybind11_fail() into Python.
  py::register_local_exception<std::runtime_error>(module, "Error");

  // Free functions.
  module.def("CharLenToBytes", &CharLenToBytes);
  module.def("BytesToCharLen", &BytesToCharLen);

  // Every class and enum is declared here, before any of their members are
  // bound further down. The nesting is:
  //
  //   RE2
  //     Anchor    (enum)
  //     Options
  //       Encoding  (enum)
  //   Set
  //   Filter
  py::class_<RE2> re2(module, "RE2");
  py::enum_<RE2::Anchor> anchor(re2, "Anchor");
  py::class_<RE2::Options> options(re2, "Options");
  py::enum_<RE2::Options::Encoding> encoding(options, "Encoding");
  py::class_<Set> set(module, "Set");
  py::class_<Filter> filter(module, "Filter");

  // RE2.Anchor
  anchor.value("UNANCHORED", RE2::Anchor::UNANCHORED);
  anchor.value("ANCHOR_START", RE2::Anchor::ANCHOR_START);
  anchor.value("ANCHOR_BOTH", RE2::Anchor::ANCHOR_BOTH);

  // RE2.Options.Encoding
  encoding.value("UTF8", RE2::Options::Encoding::EncodingUTF8);
  encoding.value("LATIN1", RE2::Options::Encoding::EncodingLatin1);

  // RE2.Options: default-constructible, with each option exposed as a
  // read/write property backed by its getter and setter.
  options.def(py::init<>());
  options.def_property("max_mem", &RE2::Options::max_mem,
                       &RE2::Options::set_max_mem);
  options.def_property("encoding", &RE2::Options::encoding,
                       &RE2::Options::set_encoding);
  options.def_property("posix_syntax", &RE2::Options::posix_syntax,
                       &RE2::Options::set_posix_syntax);
  options.def_property("longest_match", &RE2::Options::longest_match,
                       &RE2::Options::set_longest_match);
  options.def_property("log_errors", &RE2::Options::log_errors,
                       &RE2::Options::set_log_errors);
  options.def_property("literal", &RE2::Options::literal,
                       &RE2::Options::set_literal);
  options.def_property("never_nl", &RE2::Options::never_nl,
                       &RE2::Options::set_never_nl);
  options.def_property("dot_nl", &RE2::Options::dot_nl,
                       &RE2::Options::set_dot_nl);
  options.def_property("never_capture", &RE2::Options::never_capture,
                       &RE2::Options::set_never_capture);
  options.def_property("case_sensitive", &RE2::Options::case_sensitive,
                       &RE2::Options::set_case_sensitive);
  options.def_property("perl_classes", &RE2::Options::perl_classes,
                       &RE2::Options::set_perl_classes);
  options.def_property("word_boundary", &RE2::Options::word_boundary,
                       &RE2::Options::set_word_boundary);
  options.def_property("one_line", &RE2::Options::one_line,
                       &RE2::Options::set_one_line);

  // RE2: constructed through RE2InitShim; methods that need argument or
  // result conversion go through the corresponding shim.
  re2.def(py::init(&RE2InitShim));
  re2.def("ok", &RE2::ok);
  re2.def("error", &RE2ErrorShim);
  re2.def("options", &RE2::options);
  re2.def("NumberOfCapturingGroups", &RE2::NumberOfCapturingGroups);
  re2.def("NamedCapturingGroups", &RE2NamedCapturingGroupsShim);
  re2.def("ProgramSize", &RE2::ProgramSize);
  re2.def("ReverseProgramSize", &RE2::ReverseProgramSize);
  re2.def("ProgramFanout", &RE2ProgramFanoutShim);
  re2.def("ReverseProgramFanout", &RE2ReverseProgramFanoutShim);
  re2.def("PossibleMatchRange", &RE2PossibleMatchRangeShim);
  re2.def("Match", &RE2MatchShim);
  re2.def_static("QuoteMeta", &RE2QuoteMetaShim);

  // Set
  set.def(py::init<RE2::Anchor, const RE2::Options&>());
  set.def("Add", &Set::Add);
  set.def("Compile", &Set::Compile);
  set.def("Match", &Set::Match);

  // Filter: GetRE2() returns a reference under the reference_internal policy.
  filter.def(py::init<>());
  filter.def("Add", &Filter::Add);
  filter.def("Compile", &Filter::Compile);
  filter.def("Match", &Filter::Match);
  filter.def("GetRE2", &Filter::GetRE2,
             py::return_value_policy::reference_internal);
}
|
|
|
|
} // namespace re2_python
|