| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585 |
- /*
- * Copyright 2017 Google
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #include "Firestore/Port/ordered_code.h"
- #include <assert.h>
- #include <absl/base/internal/endian.h>
- #include <absl/base/internal/unaligned_access.h>
- #include <absl/base/port.h>
- #include <leveldb/db.h> // For Slice
- #include "Firestore/Port/bits.h"
- #define UNALIGNED_LOAD32 ABSL_INTERNAL_UNALIGNED_LOAD32
- #define UNALIGNED_LOAD64 ABSL_INTERNAL_UNALIGNED_LOAD64
- #define UNALIGNED_STORE32 ABSL_INTERNAL_UNALIGNED_STORE32
- #define UNALIGNED_STORE64 ABSL_INTERNAL_UNALIGNED_STORE64
- // We encode a string in different ways depending on whether the item
- // should be in lexicographically increasing or decreasing order.
- //
- //
- // Lexicographically increasing order
- //
- // We want a string-to-string mapping F(x) such that for any two strings
- //
- // x < y => F(x) < F(y)
- //
- // In addition to the normal characters '\x00' through '\xff', we want to
- // encode a few extra symbols in strings:
- //
- // <sep> Separator between items
- // <infinity> Infinite string
- //
- // Therefore we need an alphabet with at least 258 symbols. Each
- // character '\1' through '\xfe' is mapped to itself. The other four are
- // encoded into two-letter sequences starting with '\0' and '\xff':
- //
- // <sep> encoded as => \0\1
- // \0 encoded as => \0\xff
- // \xff encoded as => \xff\x00
- // <infinity> encoded as => \xff\xff
- //
- // The remaining two-letter sequences starting with '\0' and '\xff' are
- // currently unused.
- //
- // F(<infinity>) is defined above. For any finite string x, F(x) is the
- // the encodings of x's characters followed by the encoding for <sep>. The
- // ordering of two finite strings is the same as the ordering of the
- // respective characters at the first position where they differ, which in
- // turn is the same as the ordering of the encodings of those two
- // characters. Moreover, for every finite string x, F(x) < F(<infinity>).
- namespace Firestore {
- using leveldb::Slice;
- static const char kEscape1 = '\000';
- static const char kNullCharacter = '\xff'; // Combined with kEscape1
- static const char kSeparator = '\001'; // Combined with kEscape1
- static const char kEscape2 = '\xff';
- static const char kInfinity = '\xff'; // Combined with kEscape2
- static const char kFFCharacter = '\000'; // Combined with kEscape2
- static const char kEscape1_Separator[2] = {kEscape1, kSeparator};
- // Append to "*dest" the "len" bytes starting from "*src".
- inline static void AppendBytes(std::string* dest, const char* src, size_t len) {
- dest->append(src, len);
- }
- inline bool IsSpecialByte(char c) { return ((unsigned char)(c + 1)) < 2; }
- // Returns 0 if one or more of the bytes in the specified uint32 value
- // are the special values 0 or 255, and returns 4 otherwise. The
- // result of this routine can be added to "p" to either advance past
- // the next 4 bytes if they do not contain a special byte, or to
- // remain on this set of four bytes if they contain the next special
- // byte occurrence.
- //
- // REQUIRES: v is the value of loading the next 4 bytes from "*p" (we
- // pass in v rather than loading it because in some cases, the client
- // may already have the value in a register: "p" is just used for
- // assertion checking).
- inline int AdvanceIfNoSpecialBytes(uint32_t v_32, const char* p) {
- assert(UNALIGNED_LOAD32(p) == v_32);
- // See comments in SkipToNextSpecialByte if you wish to
- // understand this expression (which checks for the occurrence
- // of the special byte values 0 or 255 in any of the bytes of v_32).
- if ((v_32 - 0x01010101u) & ~(v_32 + 0x01010101u) & 0x80808080u) {
- // Special byte is in p[0..3]
- assert(IsSpecialByte(p[0]) || IsSpecialByte(p[1]) || IsSpecialByte(p[2]) ||
- IsSpecialByte(p[3]));
- return 0;
- } else {
- assert(!IsSpecialByte(p[0]));
- assert(!IsSpecialByte(p[1]));
- assert(!IsSpecialByte(p[2]));
- assert(!IsSpecialByte(p[3]));
- return 4;
- }
- }
- // Return a pointer to the first byte in the range "[start..limit)"
- // whose value is 0 or 255 (kEscape1 or kEscape2). If no such byte
- // exists in the range, returns "limit".
- inline const char* SkipToNextSpecialByte(const char* start, const char* limit) {
- // If these constants were ever changed, this routine needs to change
- assert(kEscape1 == 0);
- assert((kEscape2 & 0xffu) == 255u);
- const char* p = start;
- while (p + 8 <= limit) {
- // Find out if any of the next 8 bytes are either 0 or 255 (our
- // two characters that require special handling). We do this using
- // the technique described in:
- //
- // http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
- //
- // We use the test (x + 1) < 2 to check x = 0 or -1(255)
- //
- // If x is a byte value (0x00..0xff):
- // (x - 0x01) & 0x80 is true only when x = 0x81..0xff, 0x00
- // ~(x + 0x01) & 0x80 is true only when x = 0x00..0x7e, 0xff
- // The intersection of the above two sets is x = 0x00 or 0xff.
- // We can ignore carry between bytes because only x = 0x00 or 0xff
- // can cause carry in the expression -- and such x already makes the
- // result value non-zero.
- uint64_t v = UNALIGNED_LOAD64(p);
- bool hasZeroOr255Byte = (v - 0x0101010101010101ull) &
- ~(v + 0x0101010101010101ull) &
- 0x8080808080808080ull;
- if (!hasZeroOr255Byte) {
- // No special values in the next 8 bytes
- p += 8;
- } else {
- // We know the next 8 bytes have a special byte: find it
- #ifdef IS_LITTLE_ENDIAN
- uint32_t v_32 = static_cast<uint32_t>(v); // Low 32 bits of v
- #else
- uint32_t v_32 = UNALIGNED_LOAD32(p);
- #endif
- // Test 32 bits at once to see if special byte is in next 4 bytes
- // or the following 4 bytes
- p += AdvanceIfNoSpecialBytes(v_32, p);
- if (IsSpecialByte(p[0])) return p;
- if (IsSpecialByte(p[1])) return p + 1;
- if (IsSpecialByte(p[2])) return p + 2;
- assert(IsSpecialByte(p[3])); // Last byte must be the special one
- return p + 3;
- }
- }
- if (p + 4 <= limit) {
- uint32_t v_32 = UNALIGNED_LOAD32(p);
- p += AdvanceIfNoSpecialBytes(v_32, p);
- }
- while (p < limit && !IsSpecialByte(*p)) {
- p++;
- }
- return p;
- }
- // Expose SkipToNextSpecialByte for testing purposes
- const char* OrderedCode::TEST_SkipToNextSpecialByte(const char* start,
- const char* limit) {
- return SkipToNextSpecialByte(start, limit);
- }
- // Helper routine to encode "s" and append to "*dest", escaping special
- // characters.
- inline static void EncodeStringFragment(std::string* dest, Slice s) {
- const char* p = s.data();
- const char* limit = p + s.size();
- const char* copy_start = p;
- while (true) {
- p = SkipToNextSpecialByte(p, limit);
- if (p >= limit) break; // No more special characters that need escaping
- char c = *(p++);
- assert(IsSpecialByte(c));
- if (c == kEscape1) {
- AppendBytes(dest, copy_start, p - copy_start - 1);
- dest->push_back(kEscape1);
- dest->push_back(kNullCharacter);
- copy_start = p;
- } else {
- assert(c == kEscape2);
- AppendBytes(dest, copy_start, p - copy_start - 1);
- dest->push_back(kEscape2);
- dest->push_back(kFFCharacter);
- copy_start = p;
- }
- }
- if (p > copy_start) {
- AppendBytes(dest, copy_start, p - copy_start);
- }
- }
- void OrderedCode::WriteString(std::string* dest, Slice s) {
- EncodeStringFragment(dest, s);
- AppendBytes(dest, kEscape1_Separator, 2);
- }
- // Return number of bytes needed to encode the non-length portion
- // of val in ordered coding. Returns number in range [0,8].
- static inline unsigned int OrderedNumLength(uint64_t val) {
- const int lg = Bits::Log2Floor64(val); // -1 if val==0
- return static_cast<unsigned int>(lg + 1 + 7) / 8;
- }
- // Append n bytes from src to *dst.
- // REQUIRES: n <= 9
- // REQUIRES: src[0..8] are readable bytes (even if n is smaller)
- //
- // If we use string::append() instead of this routine, it increases the
- // runtime of WriteNumIncreasing from ~9ns to ~13ns.
- static inline void AppendUpto9(std::string* dst, const char* src,
- unsigned int n) {
- dst->append(src, 9); // Fixed-length append
- const size_t extra = 9 - n; // How many extra bytes we added
- dst->erase(dst->size() - extra, extra);
- }
- void OrderedCode::WriteNumIncreasing(std::string* dest, uint64_t val) {
- // Values are encoded with a single byte length prefix, followed
- // by the actual value in big-endian format with leading 0 bytes
- // dropped.
- // 8 bytes for value plus one byte for length. In addition, we have
- // 8 extra bytes at the end so that we can have a fixed-length append
- // call on *dest.
- char buf[17];
- UNALIGNED_STORE64(buf + 1, absl::ghtonll(val)); // buf[0] may be needed for length
- const unsigned int length = OrderedNumLength(val);
- char* start = buf + 9 - length - 1;
- *start = length;
- AppendUpto9(dest, start, length + 1);
- }
- inline static void WriteInfinityInternal(std::string* dest) {
- // Make an array so that we can just do one string operation for performance
- static const char buf[2] = {kEscape2, kInfinity};
- dest->append(buf, 2);
- }
- void OrderedCode::WriteInfinity(std::string* dest) {
- WriteInfinityInternal(dest);
- }
- void OrderedCode::WriteTrailingString(std::string* dest, Slice str) {
- dest->append(str.data(), str.size());
- }
- // Parse the encoding of a string previously encoded with or without
- // inversion. If parse succeeds, return true, consume encoding from
- // "*src", and if result != NULL append the decoded string to "*result".
- // Otherwise, return false and leave both undefined.
- inline static bool ReadStringInternal(Slice* src, std::string* result) {
- const char* start = src->data();
- const char* string_limit = src->data() + src->size();
- // We only scan up to "limit-2" since a valid string must end with
- // a two character terminator: 'kEscape1 kSeparator'
- const char* limit = string_limit - 1;
- const char* copy_start = start;
- while (true) {
- start = SkipToNextSpecialByte(start, limit);
- if (start >= limit) break; // No terminator sequence found
- const char c = *(start++);
- // If inversion is required, instead of inverting 'c', we invert the
- // character constants to which 'c' is compared. We get the same
- // behavior but save the runtime cost of inverting 'c'.
- assert(IsSpecialByte(c));
- if (c == kEscape1) {
- if (result) {
- AppendBytes(result, copy_start, start - copy_start - 1);
- }
- // kEscape1 kSeparator ends component
- // kEscape1 kNullCharacter represents '\0'
- const char next = *(start++);
- if (next == kSeparator) {
- src->remove_prefix(start - src->data());
- return true;
- } else if (next == kNullCharacter) {
- if (result) {
- *result += '\0';
- }
- } else {
- return false;
- }
- copy_start = start;
- } else {
- assert(c == kEscape2);
- if (result) {
- AppendBytes(result, copy_start, start - copy_start - 1);
- }
- // kEscape2 kFFCharacter represents '\xff'
- // kEscape2 kInfinity is an error
- const char next = *(start++);
- if (next == kFFCharacter) {
- if (result) {
- *result += '\xff';
- }
- } else {
- return false;
- }
- copy_start = start;
- }
- }
- return false;
- }
- bool OrderedCode::ReadString(Slice* src, std::string* result) {
- return ReadStringInternal(src, result);
- }
- bool OrderedCode::ReadNumIncreasing(Slice* src, uint64_t* result) {
- if (src->empty()) {
- return false; // Not enough bytes
- }
- // Decode length byte
- const int len = static_cast<unsigned char>((*src)[0]);
- // If len > 0 and src is longer than 1, the first byte of "payload"
- // must be non-zero (otherwise the encoding is not minimal).
- // In opt mode, we don't enforce that encodings must be minimal.
- assert(0 == len || src->size() == 1 || (*src)[1] != '\0');
- if (len + 1 > src->size() || len > 8) {
- return false; // Not enough bytes or too many bytes
- }
- if (result) {
- uint64_t tmp = 0;
- for (int i = 0; i < len; i++) {
- tmp <<= 8;
- tmp |= static_cast<unsigned char>((*src)[1 + i]);
- }
- *result = tmp;
- }
- src->remove_prefix(len + 1);
- return true;
- }
- inline static bool ReadInfinityInternal(Slice* src) {
- if (src->size() >= 2 && ((*src)[0] == kEscape2) && ((*src)[1] == kInfinity)) {
- src->remove_prefix(2);
- return true;
- } else {
- return false;
- }
- }
- bool OrderedCode::ReadInfinity(Slice* src) { return ReadInfinityInternal(src); }
- inline static bool ReadStringOrInfinityInternal(Slice* src, std::string* result,
- bool* inf) {
- if (ReadInfinityInternal(src)) {
- if (inf) *inf = true;
- return true;
- }
- // We don't use ReadStringInternal here because that would inline
- // the whole encoded string parsing code here. Depending on INVERT, only
- // one of the following two calls will be generated at compile time.
- bool success = OrderedCode::ReadString(src, result);
- if (success) {
- if (inf) *inf = false;
- return true;
- } else {
- return false;
- }
- }
- bool OrderedCode::ReadStringOrInfinity(Slice* src, std::string* result,
- bool* inf) {
- return ReadStringOrInfinityInternal(src, result, inf);
- }
- bool OrderedCode::ReadTrailingString(Slice* src, std::string* result) {
- if (result) result->assign(src->data(), src->size());
- src->remove_prefix(src->size());
- return true;
- }
- void OrderedCode::TEST_Corrupt(std::string* str, int k) {
- int seen_seps = 0;
- for (int i = 0; i < str->size() - 1; i++) {
- if ((*str)[i] == kEscape1 && (*str)[i + 1] == kSeparator) {
- seen_seps++;
- if (seen_seps == k) {
- (*str)[i + 1] = kSeparator + 1;
- return;
- }
- }
- }
- }
- // Signed number encoding/decoding /////////////////////////////////////
- //
- // The format is as follows:
- //
- // The first bit (the most significant bit of the first byte)
- // represents the sign, 0 if the number is negative and
- // 1 if the number is >= 0.
- //
- // Any unbroken sequence of successive bits with the same value as the sign
- // bit, up to 9 (the 8th and 9th are the most significant bits of the next
- // byte), are size bits that count the number of bytes after the first byte.
- // That is, the total length is between 1 and 10 bytes.
- //
- // The value occupies the bits after the sign bit and the "size bits"
- // till the end of the string, in network byte order. If the number
- // is negative, the bits are in 2-complement.
- //
- //
- // Example 1: number 0x424242 -> 4 byte big-endian hex string 0xf0424242:
- //
- // +---------------+---------------+---------------+---------------+
- // 1 1 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0
- // +---------------+---------------+---------------+---------------+
- // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
- // | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
- // | | | | payload: the remaining bits after the sign and size bits
- // | | | | and the delimiter bit, the value is 0x424242
- // | | | |
- // | size bits: 3 successive bits with the same value as the sign bit
- // | (followed by a delimiter bit with the opposite value)
- // | mean that there are 3 bytes after the first byte, 4 total
- // |
- // sign bit: 1 means that the number is non-negative
- //
- // Example 2: negative number -0x800 -> 2 byte big-endian hex string 0x3800:
- //
- // +---------------+---------------+
- // 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0
- // +---------------+---------------+
- // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
- // | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
- // | | payload: the remaining bits after the sign and size bits and the
- // | | delimiter bit, 2-complement because of the negative sign,
- // | | value is ~0x7ff, represents the value -0x800
- // | |
- // | size bits: 1 bit with the same value as the sign bit
- // | (followed by a delimiter bit with the opposite value)
- // | means that there is 1 byte after the first byte, 2 total
- // |
- // sign bit: 0 means that the number is negative
- //
- //
- // Compared with the simpler unsigned format used for uint64_t numbers,
- // this format is more compact for small numbers, namely one byte encodes
- // numbers in the range [-64,64), two bytes cover the range [-2^13,2^13), etc.
- // In general, n bytes encode numbers in the range [-2^(n*7-1),2^(n*7-1)).
- // (The cross-over point for compactness of representation is 8 bytes,
- // where this format only covers the range [-2^55,2^55),
- // whereas an encoding with sign bit and length in the first byte and
- // payload in all following bytes would cover [-2^56,2^56).)
- static const int kMaxSigned64Length = 10;
- // This array maps encoding length to header bits in the first two bytes.
- static const char kLengthToHeaderBits[1 + kMaxSigned64Length][2] = {
- {0, 0}, {'\x80', 0}, {'\xc0', 0}, {'\xe0', 0},
- {'\xf0', 0}, {'\xf8', 0}, {'\xfc', 0}, {'\xfe', 0},
- {'\xff', 0}, {'\xff', '\x80'}, {'\xff', '\xc0'}};
- // This array maps encoding lengths to the header bits that overlap with
- // the payload and need fixing when reading.
- static const uint64_t kLengthToMask[1 + kMaxSigned64Length] = {
- 0ULL,
- 0x80ULL,
- 0xc000ULL,
- 0xe00000ULL,
- 0xf0000000ULL,
- 0xf800000000ULL,
- 0xfc0000000000ULL,
- 0xfe000000000000ULL,
- 0xff00000000000000ULL,
- 0x8000000000000000ULL,
- 0ULL};
- // This array maps the number of bits in a number to the encoding
- // length produced by WriteSignedNumIncreasing.
- // For positive numbers, the number of bits is 1 plus the most significant
- // bit position (the highest bit position in a positive int64_t is 63).
- // For a negative number n, we count the bits in ~n.
- // That is, length = kBitsToLength[Bits::Log2Floor64(n < 0 ? ~n : n) + 1].
- static const int8_t kBitsToLength[1 + 63] = {
- 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4,
- 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7,
- 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10};
- // Calculates the encoding length in bytes of the signed number n.
- static inline int SignedEncodingLength(int64_t n) {
- return kBitsToLength[Bits::Log2Floor64(n < 0 ? ~n : n) + 1];
- }
- // Slightly faster version for n > 0.
- static inline int SignedEncodingLengthPositive(int64_t n) {
- return kBitsToLength[Bits::Log2FloorNonZero64(n) + 1];
- }
- void OrderedCode::WriteSignedNumIncreasing(std::string* dest, int64_t val) {
- const uint64_t x = val < 0 ? ~val : val;
- if (x < 64) { // fast path for encoding length == 1
- *dest += kLengthToHeaderBits[1][0] ^ val;
- return;
- }
- // buf = val in network byte order, sign extended to 10 bytes
- const char sign_byte = val < 0 ? '\xff' : '\0';
- char buf[10] = {
- sign_byte, sign_byte,
- };
- UNALIGNED_STORE64(buf + 2, absl::ghtonll(val));
- static_assert(sizeof(buf) == kMaxSigned64Length, "max length size mismatch");
- const int len = SignedEncodingLengthPositive(x);
- assert(len >= 2);
- char* const begin = buf + sizeof(buf) - len;
- begin[0] ^= kLengthToHeaderBits[len][0];
- begin[1] ^= kLengthToHeaderBits[len][1]; // ok because len >= 2
- dest->append(begin, len);
- }
- bool OrderedCode::ReadSignedNumIncreasing(Slice* src, int64_t* result) {
- if (src->empty()) return false;
- const uint64_t xor_mask = (!((*src)[0] & 0x80)) ? ~0ULL : 0ULL;
- const unsigned char first_byte = (*src)[0] ^ (xor_mask & 0xff);
- // now calculate and test length, and set x to raw (unmasked) result
- int len;
- uint64_t x;
- if (first_byte != 0xff) {
- len = 7 - Bits::Log2FloorNonZero(first_byte ^ 0xff);
- if (src->size() < len) return false;
- x = xor_mask; // sign extend using xor_mask
- for (int i = 0; i < len; ++i)
- x = (x << 8) | static_cast<unsigned char>((*src)[i]);
- } else {
- len = 8;
- if (src->size() < len) return false;
- const unsigned char second_byte = (*src)[1] ^ (xor_mask & 0xff);
- if (second_byte >= 0x80) {
- if (second_byte < 0xc0) {
- len = 9;
- } else {
- const unsigned char third_byte = (*src)[2] ^ (xor_mask & 0xff);
- if (second_byte == 0xc0 && third_byte < 0x80) {
- len = 10;
- } else {
- return false; // either len > 10 or len == 10 and #bits > 63
- }
- }
- if (src->size() < len) return false;
- }
- x = absl::gntohll(UNALIGNED_LOAD64(src->data() + len - 8));
- }
- x ^= kLengthToMask[len]; // remove spurious header bits
- assert(len == SignedEncodingLength(x));
- if (result) *result = x;
- src->remove_prefix(len);
- return true;
- }
- } // namespace Firestore
|