string_data.hpp 12.6 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425
/*************************************************************************
 *
 * Copyright 2016 Realm Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **************************************************************************/

#ifndef REALM_STRING_HPP
#define REALM_STRING_HPP

#include <realm/null.hpp>
#include <realm/util/features.h>
#include <realm/util/optional.hpp>

#include <algorithm>
#include <array>
#include <cfloat>
#include <cmath>
#include <cstddef>
#include <cstring>
#include <ostream>
#include <string>

namespace realm {

/// Selects CityHash64 on 64-bit platforms, and Murmur2 on 32-bit platforms.
/// This is what libc++ does, and it is a good general choice for a
/// non-cryptographic hash function (suitable for std::unordered_map etc.).
size_t murmur2_or_cityhash(const unsigned char* data, size_t len) noexcept;

uint_least32_t murmur2_32(const unsigned char* data, size_t len) noexcept;
uint_least64_t cityhash_64(const unsigned char* data, size_t len) noexcept;


/// A reference to a chunk of character data.
///
/// An instance of this class can be thought of as a type tag on a region of
/// memory. It does not own the referenced memory, nor does it in any other way
/// attempt to manage the lifetime of it.
///
/// A null character inside the referenced region is considered a part of the
/// string by Realm.
///
/// For compatibility with C-style strings, when a string is stored in a Realm
/// database, it is always followed by a terminating null character, regardless
/// of whether the string itself has internal null characters. This means that
/// when a StringData object is extracted from Realm, the referenced region is
/// guaranteed to be followed immediately by an extra null character, but that
/// null character is not inside the referenced region. Therefore, all of the
/// following forms are guaranteed to return a pointer to a null-terminated
/// string:
///
/// \code{.cpp}
///
///   group.get_table_name(...).data()
///   table.get_column_name().data()
///   table.get_string(...).data()
///   table.get_mixed(...).get_string().data()
///
/// \endcode
///
/// Note that in general, no assumptions can be made about what follows a string
/// that is referenced by a StringData object, or whether anything follows it at
/// all. In particular, the receiver of a StringData object cannot assume that
/// the referenced string is followed by a null character unless there is an
/// externally provided guarantee.
///
/// This class makes it possible to distinguish between a 'null' reference and a
/// reference to the empty string (see is_null()).
///
/// \sa BinaryData
/// \sa Mixed
class StringData {
public:
    /// Construct a null reference.
    StringData() noexcept;

    StringData(int) = delete;

    /// If \a external_data is 'null', \a data_size must be zero.
    StringData(const char* external_data, size_t data_size) noexcept;

    template <class T, class A>
    StringData(const std::basic_string<char, T, A>&);

    template <class T, class A>
    operator std::basic_string<char, T, A>() const;

    template <class T, class A>
    StringData(const util::Optional<std::basic_string<char, T, A>>&);

    StringData(std::string_view sv);

    StringData(const null&) noexcept;

    /// Initialize from a zero terminated C style string. Pass null to construct
    /// a null reference.
    StringData(const char* c_str) noexcept;

    char operator[](size_t i) const noexcept;

    const char* data() const noexcept;
    size_t size() const noexcept;

    /// Is this a null reference?
    ///
    /// An instance of StringData is a null reference when, and only when the
    /// stored size is zero (size()) and the stored pointer is the null pointer
    /// (data()).
    ///
    /// In the case of the empty string, the stored size is still zero, but the
    /// stored pointer is **not** the null pointer. It could for example point
    /// to the empty string literal. Note that the actual value of the pointer
    /// is immaterial in this case (as long as it is not zero), because when the
    /// size is zero, it is an error to dereference the pointer.
    ///
    /// Conversion of a StringData object to `bool` yields the logical negation
    /// of the result of calling this function. In other words, a StringData
    /// object is converted to true if it is not the null reference, otherwise
    /// it is converted to false.
    bool is_null() const noexcept;

    friend bool operator==(const StringData&, const StringData&) noexcept;
    friend bool operator!=(const StringData&, const StringData&) noexcept;

    //@{
    /// Trivial bytewise lexicographical comparison.
    friend bool operator<(const StringData&, const StringData&) noexcept;
    friend bool operator>(const StringData&, const StringData&) noexcept;
    friend bool operator<=(const StringData&, const StringData&) noexcept;
    friend bool operator>=(const StringData&, const StringData&) noexcept;
    //@}

    bool begins_with(StringData) const noexcept;
    bool ends_with(StringData) const noexcept;
    bool contains(StringData) const noexcept;
    bool contains(StringData d, const std::array<uint8_t, 256> &charmap) const noexcept;
    
    // Wildcard matching ('?' for single char, '*' for zero or more chars)
    // case insensitive version in unicode.hpp
    bool like(StringData) const noexcept;

    //@{
    /// Undefined behavior if \a n, \a i, or <tt>i+n</tt> is greater than
    /// size().
    StringData prefix(size_t n) const noexcept;
    StringData suffix(size_t n) const noexcept;
    StringData substr(size_t i, size_t n) const noexcept;
    StringData substr(size_t i) const noexcept;
    //@}

    template <class C, class T>
    friend std::basic_ostream<C, T>& operator<<(std::basic_ostream<C, T>&, const StringData&);

    explicit operator bool() const noexcept;
    explicit operator std::string_view() const noexcept
    {
        return std::string_view(m_data);
    }

    /// If the StringData is NULL, the hash is 0. Otherwise, the function
    /// `murmur2_or_cityhash()` is called on the data.
    size_t hash() const noexcept;

private:
    const char* m_data;
    size_t m_size;

    static bool matchlike(const StringData& text, const StringData& pattern) noexcept;
    static bool matchlike_ins(const StringData& text, const StringData& pattern_upper,
                              const StringData& pattern_lower) noexcept;

    friend bool string_like_ins(StringData, StringData) noexcept;
    friend bool string_like_ins(StringData, StringData, StringData) noexcept;
};


// Implementation:

inline StringData::StringData() noexcept
    : m_data(nullptr)
    , m_size(0)
{
}

inline StringData::StringData(const char* external_data, size_t data_size) noexcept
    : m_data(external_data)
    , m_size(data_size)
{
    REALM_ASSERT_DEBUG(external_data || data_size == 0);
}

inline StringData::StringData(std::string_view sv)
    : m_data(sv.data())
    , m_size(sv.size())
{
}

template <class T, class A>
inline StringData::StringData(const std::basic_string<char, T, A>& s)
    : m_data(s.data())
    , m_size(s.size())
{
}

template <class T, class A>
inline StringData::operator std::basic_string<char, T, A>() const
{
    return std::basic_string<char, T, A>(m_data, m_size);
}

template <class T, class A>
inline StringData::StringData(const util::Optional<std::basic_string<char, T, A>>& s)
    : m_data(s ? s->data() : nullptr)
    , m_size(s ? s->size() : 0)
{
}

inline StringData::StringData(const null&) noexcept
    : m_data(nullptr)
    , m_size(0)
{
}

inline StringData::StringData(const char* c_str) noexcept
    : m_data(c_str)
    , m_size(0)
{
    if (c_str)
        m_size = std::char_traits<char>::length(c_str);
}

inline char StringData::operator[](size_t i) const noexcept
{
    return m_data[i];
}

inline const char* StringData::data() const noexcept
{
    return m_data;
}

inline size_t StringData::size() const noexcept
{
    return m_size;
}

inline bool StringData::is_null() const noexcept
{
    return !m_data;
}

inline bool operator==(const StringData& a, const StringData& b) noexcept
{
    return a.m_size == b.m_size && a.is_null() == b.is_null() && safe_equal(a.m_data, a.m_data + a.m_size, b.m_data);
}

inline bool operator!=(const StringData& a, const StringData& b) noexcept
{
    return !(a == b);
}

inline bool operator<(const StringData& a, const StringData& b) noexcept
{
    if (a.is_null() && !b.is_null()) {
        // Null strings are smaller than all other strings, and not
        // equal to empty strings.
        return true;
    }
    return std::lexicographical_compare(a.m_data, a.m_data + a.m_size, b.m_data, b.m_data + b.m_size);
}

inline bool operator>(const StringData& a, const StringData& b) noexcept
{
    return b < a;
}

inline bool operator<=(const StringData& a, const StringData& b) noexcept
{
    return !(b < a);
}

inline bool operator>=(const StringData& a, const StringData& b) noexcept
{
    return !(a < b);
}

inline bool StringData::begins_with(StringData d) const noexcept
{
    if (is_null() && !d.is_null())
        return false;
    return d.m_size <= m_size && safe_equal(m_data, m_data + d.m_size, d.m_data);
}

inline bool StringData::ends_with(StringData d) const noexcept
{
    if (is_null() && !d.is_null())
        return false;
    return d.m_size <= m_size && safe_equal(m_data + m_size - d.m_size, m_data + m_size, d.m_data);
}

inline bool StringData::contains(StringData d) const noexcept
{
    if (is_null() && !d.is_null())
        return false;

    return d.m_size == 0 || std::search(m_data, m_data + m_size, d.m_data, d.m_data + d.m_size) != m_data + m_size;
}

/// This method takes an array that maps chars to distance that can be moved (and zero for chars not in needle),
/// allowing the method to apply Boyer-Moore for quick substring search
/// The map is calculated in the StringNode<Contains> class (so it can be reused across searches)
inline bool StringData::contains(StringData d, const std::array<uint8_t, 256> &charmap) const noexcept
{
    if (is_null() && !d.is_null())
        return false;
    
    size_t needle_size = d.size();
    if (needle_size == 0)
        return true;
    
    // Prepare vars to avoid lookups in loop
    size_t last_char_pos = d.size()-1;
    unsigned char lastChar = d[last_char_pos];
    
    // Do Boyer-Moore search
    size_t p = last_char_pos;
    while (p < m_size) {
        unsigned char c = m_data[p]; // Get candidate for last char
        
        if (c == lastChar) {
            StringData candidate = substr(p-needle_size+1, needle_size);
            if (candidate == d)
                return true; // text found!
        }
        
        // If we don't have a match, see how far we can move char_pos
        if (charmap[c] == 0)
            p += needle_size; // char was not present in search string
        else
            p += charmap[c];
    }
    
    return false;
}
    
inline bool StringData::like(StringData d) const noexcept
{
    if (is_null() || d.is_null()) {
        return (is_null() && d.is_null());
    }

    return matchlike(*this, d);
}

inline StringData StringData::prefix(size_t n) const noexcept
{
    return substr(0, n);
}

inline StringData StringData::suffix(size_t n) const noexcept
{
    return substr(m_size - n);
}

inline StringData StringData::substr(size_t i, size_t n) const noexcept
{
    return StringData(m_data + i, n);
}

inline StringData StringData::substr(size_t i) const noexcept
{
    return substr(i, m_size - i);
}

template <class C, class T>
inline std::basic_ostream<C, T>& operator<<(std::basic_ostream<C, T>& out, const StringData& d)
{
    if (d.is_null()) {
        out << "<null>";
    }
    else {
        for (const char* i = d.m_data; i != d.m_data + d.m_size; ++i)
            out << *i;
    }
    return out;
}

inline StringData::operator bool() const noexcept
{
    return !is_null();
}

inline size_t StringData::hash() const noexcept
{
    if (is_null())
        return 0;
    auto unsigned_data = reinterpret_cast<const unsigned char*>(m_data);
    return murmur2_or_cityhash(unsigned_data, m_size);
}

} // namespace realm

namespace std {
template <>
struct hash<::realm::StringData> {
    inline size_t operator()(const ::realm::StringData& str) const noexcept
    {
        return str.hash();
    }
};
} // namespace std

#endif // REALM_STRING_HPP