add ucs2.diff, showing old UCS-2 support

R=rsc CC=re2-dev http://codereview.appspot.com/5641045
2025-10-14 02:17:38 +08:00 · 2012-02-07 13:13:06 -05:00
parent 10d085d35e
commit 0176cc7dd2
1 changed files with 567 additions and 0 deletions
--- a/ucs2.diff
+++ b/ucs2.diff
@@ -0,0 +1,567 @@
+This is a dump from Google's source control system of the change
+that removed UCS-2 support from RE2.  As the explanation below says,
+UCS-2 mode is fundamentally at odds with multi-line ^ and $ (and with
+\b and \B), which it rejected as unsupported, so it never worked very
+well.  If your patterns avoid those operators, it did work.  It assumed
+that the UCS-2 data was in the native host byte order.
+
+If you are interested in adding UCS-2 mode back, this patch might
+be a good starting point.
+
+
+Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15
+
+	Retire UCS-2 mode.
+	
+	I added it as an experiment for V8, but it
+	requires 2-byte lookahead to do completely,
+	and RE2 has 1-byte lookahead (enough for UTF-8)
+	as a fairly deep fundamental assumption,
+	so it did not support ^ or $.
+
+==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ====
+re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319
+      cap_[0] = p;
+      if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
+        return true;
+-     if (prog_->flags() & Regexp::UCS2)
+-       p++;
+    }
+    return false;
+  }
+==== re2/compile.cc#17 - re2/compile.cc#18 ====
+re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100
+  // Input encodings.
+  enum Encoding {
+    kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
+-   kEncodingUCS2,     // UCS-2 (0-FFFF), native byte order
+    kEncodingLatin1,    // Latin1 (0-FF)
+  };
+  
+re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172
+    void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
+    void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
+    void Add_80_10ffff();
+-   void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase);
+-   void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
+-                    uint8 lo2, uint8 hi2, bool fold2);
+  
+    // New suffix that matches the byte range lo-hi, then goes to next.
+    Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);
+re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477
+  
+  // Converts rune range lo-hi into a fragment that recognizes
+  // the bytes that would make up those runes in the current
+- // encoding (Latin 1, UTF-8, or UCS-2).
+ // encoding (Latin 1 or UTF-8).
+  // This lets the machine work byte-by-byte even when
+  // using multibyte encodings.
+  
+re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489
+      case kEncodingLatin1:
+        AddRuneRangeLatin1(lo, hi, foldcase);
+        break;
+-     case kEncodingUCS2:
+-       AddRuneRangeUCS2(lo, hi, foldcase);
+-       break;
+    }
+  }
+  
+re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501
+    AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));
+  }
+  
+- // Test whether 16-bit values are big or little endian.
+- static bool BigEndian() {
+-   union {
+-     char byte[2];
+-     int16 endian;
+-   } u;
+- 
+-   u.byte[0] = 1;
+-   u.byte[1] = 2;
+-   return u.endian == 0x0102;
+- }
+- 
+- void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
+-                            uint8 lo2, uint8 hi2, bool fold2) {
+-   Inst* ip;
+-   if (reversed_) {
+-     ip = RuneByteSuffix(lo1, hi1, fold1, NULL);
+-     ip = RuneByteSuffix(lo2, hi2, fold2, ip);
+-   } else {
+-     ip = RuneByteSuffix(lo2, hi2, fold2, NULL);
+-     ip = RuneByteSuffix(lo1, hi1, fold1, ip);
+-   }
+-   AddSuffix(ip);
+- }
+- 
+- void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) {
+-   if (lo > hi || lo > 0xFFFF)
+-     return;
+-   if (hi > 0xFFFF)
+-     hi = 0xFFFF;
+- 
+-   // We'll assemble a pattern assuming big endian.
+-   // If the machine isn't, tell Cat to reverse its arguments.
+-   bool oldreversed = reversed_;
+-   if (!BigEndian()) {
+-     reversed_ = !oldreversed;
+-   }
+- 
+-   // Split into bytes.
+-   int lo1 = lo >> 8;
+-   int lo2 = lo & 0xFF;
+-   int hi1 = hi >> 8;
+-   int hi2 = hi & 0xFF;
+- 
+-   if (lo1 == hi1) {
+-     // Easy case: high bits are same in both.
+-     // Only do ASCII case folding on the second byte if the top byte is 00.
+-     AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase);
+-   } else {
+-     // Harder case: different second byte ranges depending on first byte.
+- 
+-     // Initial fragment.
+-     if (lo2 > 0) {
+-       AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase);
+-       lo1++;
+-     }
+- 
+-     // Trailing fragment.
+-     if (hi2 < 0xFF) {
+-       AddUCS2Pair(hi1, hi1, false, 0, hi2, false);
+-       hi1--;
+-     }
+- 
+-     // Inner ranges.
+-     if (lo1 <= hi1) {
+-       AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false);
+-     }
+-   }
+- 
+-   // Restore reverse setting.
+-   reversed_ = oldreversed;
+- }
+- 
+  // Table describing how to make a UTF-8 matching machine
+  // for the rune range 80-10FFFF (Runeself-Runemax).
+  // This range happens frequently enough (for example /./ and /[^a-z]/)
+re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634
+  
+  Frag Compiler::Literal(Rune r, bool foldcase) {
+    switch (encoding_) {
+-     default:  // UCS-2 or something new
+-       BeginRange();
+-       AddRuneRange(r, r, foldcase);
+-       return EndRange();
+     default:
+       return kNullFrag;
+  
+      case kEncodingLatin1:
+        return ByteRange(r, r, foldcase);
+re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850
+  
+    if (re->parse_flags() & Regexp::Latin1)
+      c.encoding_ = kEncodingLatin1;
+-   else if (re->parse_flags() & Regexp::UCS2)
+-     c.encoding_ = kEncodingUCS2;
+    c.reversed_ = reversed;
+    if (max_mem <= 0) {
+      c.max_inst_ = 100000;  // more than enough
+re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905
+      c.prog_->set_start_unanchored(c.prog_->start());
+    } else {
+      Frag dot;
+-     if (c.encoding_ == kEncodingUCS2) {
+-       dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false));
+-     } else {
+-       dot = c.ByteRange(0x00, 0xFF, false);
+-     }
+     dot = c.ByteRange(0x00, 0xFF, false);
+      Frag dotloop = c.Star(dot, true);
+      Frag unanchored = c.Cat(dotloop, all);
+      c.prog_->set_start_unanchored(unanchored.begin);
+==== re2/nfa.cc#8 - re2/nfa.cc#9 ====
+re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431
+    const char* bp = context.begin();
+    int c = -1;
+    int wasword = 0;
+-   bool ucs2 = prog_->flags() & Regexp::UCS2;
+  
+    if (text.begin() > context.begin()) {
+      c = text.begin()[-1] & 0xFF;
+re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497
+        // If there's a required first byte for an unanchored search
+        // and we're not in the middle of any possible matches,
+        // use memchr to search for the byte quickly.
+-       if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 &&
+       if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
+            p < text.end() && (p[0] & 0xFF) != first_byte_) {
+          p = reinterpret_cast<const char*>(memchr(p, first_byte_,
+                                                   text.end() - p));
+re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514
+          flag = Prog::EmptyFlags(context, p);
+        }
+  
+-       // In UCS-2 mode, if we need to start a new thread,
+-       // make sure to do it on an even boundary.
+-       if(ucs2 && runq->size() == 0 &&
+-           (p - context.begin()) % 2 && p < text.end()) {
+-         p++;
+-         flag = Prog::EmptyFlags(context, p);
+-       }
+- 
+        // Steal match storage (cleared but unused as of yet)
+        // temporarily to hold match boundaries for new thread.
+-       // In UCS-2 mode, only start the thread on a 2-byte boundary.
+-       if(!ucs2 || (p - context.begin()) % 2 == 0) {
+-         match_[0] = p;
+-         AddToThreadq(runq, start_, flag, p, match_);
+-         match_[0] = NULL;
+-       }
+       match_[0] = p;
+       AddToThreadq(runq, start_, flag, p, match_);
+       match_[0] = NULL;
+      }
+  
+      // If all the threads have died, stop early.
+==== re2/parse.cc#22 - re2/parse.cc#23 ====
+re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165
+      status_(status), stacktop_(NULL), ncap_(0) {
+    if (flags_ & Latin1)
+      rune_max_ = 0xFF;
+-   else if (flags & UCS2)
+-     rune_max_ = 0xFFFF;
+    else
+      rune_max_ = Runemax;
+  }
+re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374
+  bool Regexp::ParseState::PushCarat() {
+    if (flags_ & OneLine) {
+      return PushSimpleOp(kRegexpBeginText);
+-   } else {
+-     if (flags_ & UCS2) {
+-       status_->set_code(kRegexpUnsupported);
+-       status_->set_error_arg("multiline ^ in UCS-2 mode");
+-       return false;
+-     }
+-     return PushSimpleOp(kRegexpBeginLine);
+    }
+   return PushSimpleOp(kRegexpBeginLine);
+  }
+  
+  // Pushes a \b or \B onto the stack.
+  bool Regexp::ParseState::PushWordBoundary(bool word) {
+-   if (flags_ & UCS2) {
+-     status_->set_code(kRegexpUnsupported);
+-     status_->set_error_arg("\\b or \\B in UCS-2 mode");
+-     return false;
+-   }
+    if (word)
+      return PushSimpleOp(kRegexpWordBoundary);
+    return PushSimpleOp(kRegexpNoWordBoundary);
+re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389
+      bool ret = PushSimpleOp(kRegexpEndText);
+      flags_ = oflags;
+      return ret;
+-   }
+-   if (flags_ & UCS2) {
+-     status_->set_code(kRegexpUnsupported);
+-     status_->set_error_arg("multiline $ in UCS-2 mode");
+-     return false;
+    }
+    return PushSimpleOp(kRegexpEndLine);
+  }
+==== re2/re2.cc#34 - re2/re2.cc#35 ====
+re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84
+        return RE2::ErrorBadUTF8;
+      case re2::kRegexpBadNamedCapture:
+        return RE2::ErrorBadNamedCapture;
+-     case re2::kRegexpUnsupported:
+-       return RE2::ErrorUnsupported;
+    }
+    return RE2::ErrorInternal;
+  }
+re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125
+        break;
+      case RE2::Options::EncodingLatin1:
+        flags |= Regexp::Latin1;
+-       break;
+-     case RE2::Options::EncodingUCS2:
+-       flags |= Regexp::UCS2;
+        break;
+    }
+  
+==== re2/re2.h#36 - re2/re2.h#37 ====
+re2/re2.h#36:246,252 - re2/re2.h#37:246,251
+      ErrorBadUTF8,            // invalid UTF-8 in regexp
+      ErrorBadNamedCapture,    // bad named capture group
+      ErrorPatternTooLarge,    // pattern too large (compile failed)
+-     ErrorUnsupported,        // unsupported feature (in UCS-2 mode)
+    };
+  
+    // Predefined common options.
+re2/re2.h#36:570,576 - re2/re2.h#37:569,574
+  
+      enum Encoding {
+        EncodingUTF8 = 1,
+-       EncodingUCS2,      // 16-bit Unicode 0-FFFF only
+        EncodingLatin1
+      };
+  
+==== re2/regexp.cc#15 - re2/regexp.cc#16 ====
+re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329
+  // the regexp that remains after the prefix.  The prefix might
+  // be ASCII case-insensitive.
+  bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
+-   // Don't even bother for UCS-2; it's time to throw that code away.
+-   if (parse_flags_ & UCS2)
+-     return false;
+- 
+    // No need for a walker: the regexp must be of the form
+    // 1. some number of ^ anchors
+    // 2. a literal char or string
+==== re2/regexp.h#20 - re2/regexp.h#21 ====
+re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192
+    kRegexpBadPerlOp,          // bad perl operator
+    kRegexpBadUTF8,            // invalid UTF-8 in regexp
+    kRegexpBadNamedCapture,    // bad named capture
+-   kRegexpUnsupported,        // unsupported operator
+  };
+  
+  // Error status for certain operations.
+re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314
+                             //   \Q and \E to disable/enable metacharacters
+                             //   (?P<name>expr) for named captures
+                             //   \C to match any single byte
+-     UCS2         = 1<<10,  // Text is in UCS-2, regexp is in UTF-8.
+-     UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group
+     UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
+                             //   and \P{Han} for its negation.
+-     NeverNL      = 1<<12,  // Never match NL, even if the regexp mentions
+     NeverNL      = 1<<11,  // Never match NL, even if the regexp mentions
+                             //   it explicitly.
+  
+      // As close to Perl as we can get.
+==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ====
+re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139
+      cap_[0] = p;
+      if (Visit(prog_->start(), p))  // Match must be leftmost; done.
+        return true;
+-     if (prog_->flags() & Regexp::UCS2)
+-       p++;
+    }
+    return false;
+  }
+==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ====
+re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152
+  static ParseMode parse_modes[] = {
+    { single_line,                   "single-line"          },
+    { single_line|Regexp::Latin1,    "single-line, latin1"  },
+-   { single_line|Regexp::UCS2,     "single-line, ucs2"   },
+    { multi_line,                    "multiline"            },
+    { multi_line|Regexp::NonGreedy,  "multiline, nongreedy" },
+    { multi_line|Regexp::Latin1,     "multiline, latin1"    },
+-   { multi_line|Regexp::UCS2,      "multiline, ucs2"     },
+  };
+  
+  static string FormatMode(Regexp::ParseFlags flags) {
+re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185
+    RegexpStatus status;
+    regexp_ = Regexp::Parse(regexp_str, flags, &status);
+    if (regexp_ == NULL) {
+-     if (status.code() != kRegexpUnsupported) {
+-       LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
+-                 << " mode: " << FormatMode(flags);
+-       error_ = true;
+-     }
+     LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
+               << " mode: " << FormatMode(flags);
+     error_ = true;
+      return;
+    }
+    prog_ = regexp_->CompileToProg(0);
+re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231
+      RE2::Options options;
+      if (flags & Regexp::Latin1)
+        options.set_encoding(RE2::Options::EncodingLatin1);
+-     else if (flags & Regexp::UCS2)
+-       options.set_encoding(RE2::Options::EncodingUCS2);
+      if (kind_ == Prog::kLongestMatch)
+        options.set_longest_match(true);
+      re2_ = new RE2(re, options);
+re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280
+      delete re2_;
+  }
+  
+- // Converts UTF-8 string in text into UCS-2 string in new_text.
+- static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) {
+-   const char* p = text.begin();
+-   const char* ep = text.end();
+-   uint16* q = new uint16[ep - p];
+-   uint16* q0 = q;
+- 
+-   int n;
+-   Rune r;
+-   for (; p < ep; p += n) {
+-     if (!fullrune(p, ep - p)) {
+-       delete[] q0;
+-       return false;
+-     }
+-     n = chartorune(&r, p);
+-     if (r > 0xFFFF) {
+-       delete[] q0;
+-       return false;
+-     }
+-     *q++ = r;
+-   }
+-   *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0));
+-   return true;
+- }
+- 
+- // Rewrites *sp from being a pointer into text8 (UTF-8)
+- // to being a pointer into text16 (equivalent text but in UCS-2).
+- static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16,
+-                               StringPiece *sp) {
+-   if (sp->begin() == NULL && text8.begin() != NULL)
+-     return;
+- 
+-   int nrune = 0;
+-   int n;
+-   Rune r;
+-   const char* p = text8.begin();
+-   const char* ep = text8.end();
+-   const char* spbegin = NULL;
+-   const char* spend = NULL;
+-   for (;;) {
+-     if (p == sp->begin())
+-       spbegin = text16.begin() + sizeof(uint16)*nrune;
+-     if (p == sp->end())
+-       spend = text16.begin() + sizeof(uint16)*nrune;
+-     if (p >= ep)
+-       break;
+-     n = chartorune(&r, p);
+-     p += n;
+-     nrune++;
+-   }
+-   if (spbegin == NULL || spend == NULL) {
+-     LOG(FATAL) << "Error in AdjustUTF8ToUCS2 "
+-                << CEscape(text8) << " "
+-                << (int)(sp->begin() - text8.begin()) << " "
+-                << (int)(sp->end() - text8.begin());
+-   }
+-   *sp = StringPiece(spbegin, spend - spbegin);
+- }
+- 
+- // Rewrites *sp from being a pointer into text16 (UCS-2)
+- // to being a pointer into text8 (equivalent text but in UTF-8).
+- static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8,
+-                               StringPiece* sp) {
+-   if (sp->begin() == NULL)
+-     return;
+- 
+-   int nrune = 0;
+-   int n;
+-   Rune r;
+-   const char* p = text8.begin();
+-   const char* ep = text8.end();
+-   const char* spbegin = NULL;
+-   const char* spend = NULL;
+-   for (;;) {
+-     if (nrune == (sp->begin() - text16.begin())/2)
+-       spbegin = p;
+-     if (nrune == (sp->end() - text16.begin())/2)
+-       spend = p;
+-     if (p >= ep)
+-       break;
+-     n = chartorune(&r, p);
+-     p += n;
+-     nrune++;
+-   }
+-   if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) {
+-     LOG(FATAL) << "Error in AdjustUCS2ToUTF8 "
+-                << CEscape(text16) << " "
+-                << (int)(sp->begin() - text16.begin()) << " "
+-                << (int)(sp->end() - text16.begin());
+-   }
+-   *sp = StringPiece(spbegin, spend - spbegin);
+- }
+- 
+  // Runs a single search using the named engine type.
+  // This interface hides all the irregularities of the various
+  // engine interfaces from the rest of this file.
+re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300
+  
+    StringPiece text = orig_text;
+    StringPiece context = orig_context;
+-   bool ucs2 = false;
+  
+-   if ((flags() & Regexp::UCS2) && type != kEnginePCRE) {
+-     if (!ConvertUTF8ToUCS2(orig_context, &context)) {
+-       result->skipped = true;
+-       return;
+-     }
+- 
+-     // Rewrite context to refer to new text.
+-     AdjustUTF8ToUCS2(orig_context, context, &text);
+-     ucs2 = true;
+-   }
+- 
+    switch (type) {
+      default:
+        LOG(FATAL) << "Bad RunSearch type: " << (int)type;
+re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451
+      }
+    }
+  
+-   // If we did UCS-2 matching, rewrite the matches to refer
+-   // to the original UTF-8 text.
+-   if (ucs2) {
+-     if (result->matched) {
+-       if (result->have_submatch0) {
+-         AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]);
+-       } else if (result->have_submatch) {
+-         for (int i = 0; i < nsubmatch; i++) {
+-           AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]);
+-         }
+-       }
+-     }
+-     delete[] context.begin();
+-   }
+- 
+    if (!result->matched)
+      memset(result->submatch, 0, sizeof result->submatch);
+  }
+re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475
+    return true;
+  }
+  
+- // Check whether text uses only Unicode points <= 0xFFFF
+- // (in the BMP).
+- static bool IsBMP(const StringPiece& text) {
+-   const char* p = text.begin();
+-   const char* ep = text.end();
+-   while (p < ep) {
+-     if (!fullrune(p, ep - p))
+-       return false;
+-     Rune r;
+-     p += chartorune(&r, p);
+-     if (r > 0xFFFF)
+-       return false;
+-   }
+-   return true;
+- }
+- 
+  // Runs a single test.
+  bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
+                             Prog::Anchor anchor) {
+re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483
+    Result correct;
+    RunSearch(kEngineBacktrack, text, context, anchor, &correct);
+    if (correct.skipped) {
+-     if (regexp_ == NULL || !IsBMP(context))  // okay to skip in UCS-2 mode
+     if (regexp_ == NULL)
+        return true;
+      LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
+                 << " " << FormatMode(flags_);