sneedmc/libraries/murmur2/src/MurmurHash2.cpp

//-----------------------------------------------------------------------------
// MurmurHash2 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
//
// This was modified to make it usable incrementally.
// Those modifications are also placed in the public domain, and the author of
// such modifications hereby disclaims copyright to this source code.

#include "MurmurHash2.h"

#include <cstring>
#include <vector>

//-----------------------------------------------------------------------------

// Mixing constants used throughout the hash: 'm' is the multiplier and 'r' is
// the right-shift distance applied to each 32-bit block.
// They were generated offline; there is nothing 'magic' about them beyond
// happening to mix well.
constexpr uint32_t m = 0x5bd1e995u;
constexpr int r = 24;

/**
 * Computes a 32-bit MurmurHash2 (seed forced to 1) over the contents of a
 * file stream, ignoring every byte for which `filter_out` returns true.
 *
 * The stream is read twice. The first pass only counts the bytes that survive
 * the filter, because the hash is initialised from the total length. The
 * second pass feeds the surviving bytes, four at a time, into
 * FourBytes_MurmurHash2, followed by one last call that finalises the hash.
 *
 * @param file_stream  Stream to hash. It is rewound between the two passes and
 *                     closed before this function returns.
 * @param buffer_size  Size in bytes of the chunk used for each read. A value
 *                     of zero is replaced by a fallback size.
 * @param filter_out   Predicate; bytes for which it returns true are excluded
 *                     from both the length and the hash.
 * @return The resulting hash. If the stream cannot be read or rewound, the
 *         function still returns (it does not loop forever), but the value is
 *         only the hash of whatever data could be read.
 */
uint32_t MurmurHash2(std::ifstream&& file_stream, std::size_t buffer_size, std::function<bool(char)> filter_out)
{
    // A zero-length read can never reach end-of-file, so it would spin
    // forever; substitute a usable chunk size instead.
    constexpr std::size_t fallback_buffer_size = 4096;

    // std::vector releases the memory even if `filter_out` throws.
    std::vector<char> buffer(buffer_size != 0 ? buffer_size : fallback_buffer_size);
    const auto chunk_size = static_cast<std::streamsize>(buffer.size());

    // Pass 1: we need the size without the filtered out characters before
    // actually calculating the hash, to set up its initial value.
    uint32_t size = 0;
    do {
        file_stream.read(buffer.data(), chunk_size);
        const auto received = static_cast<std::size_t>(file_stream.gcount());
        for (std::size_t i = 0; i < received; ++i) {
            if (!filter_out(buffer[i]))
                size += 1;
        }
        // good() turns false on EOF *and* on any failure (unopened file, I/O
        // error), whereas testing only eof() never terminates on a failure.
    } while (file_stream.good());

    // Reset the EOF/fail state and rewind for the hashing pass.
    file_stream.clear();
    file_stream.seekg(0, std::ios_base::beg);

    // Bytes waiting to form the next complete 4-byte block.
    unsigned char pending[4] = {};
    std::size_t pending_count = 0;

    // This forces a seed of 1.
    IncrementalHashInfo info{ static_cast<uint32_t>(1) ^ size, size };

    // Pass 2: feed the surviving bytes into the hash.
    do {
        file_stream.read(buffer.data(), chunk_size);
        const auto received = static_cast<std::size_t>(file_stream.gcount());
        for (std::size_t i = 0; i < received; ++i) {
            const char c = buffer[i];

            if (filter_out(c))
                continue;

            pending[pending_count] = static_cast<unsigned char>(c);
            pending_count = (pending_count + 1) % 4;

            // Mix 4 bytes at a time into the hash
            if (pending_count == 0)
                FourBytes_MurmurHash2(pending, info);
        }
    } while (file_stream.good());

    // Do one last bit shuffle in the hash; this also folds in the 0-3
    // leftover bytes still sitting in `pending`.
    FourBytes_MurmurHash2(pending, info);

    file_stream.close();
    return info.h;
}

/**
 * Feeds up to four bytes into an in-progress MurmurHash2 computation.
 *
 * While `prev.len` is at least 4, `data` is consumed as one full 32-bit block
 * and `prev.len` is reduced by 4. Otherwise the call is treated as the final
 * step: the remaining `prev.len` (1-3) tail bytes, if any, are folded in, the
 * closing mixes are applied, and `prev.len` is set to 0.
 *
 * @param data  Bytes to mix in. Four bytes are read for a full block; only the
 *              first `prev.len` bytes are read in the final step.
 * @param prev  Running hash state (`h` and remaining `len`), updated in place.
 */
void FourBytes_MurmurHash2(const unsigned char* data, IncrementalHashInfo& prev)
{
    if (prev.len >= 4) {
        // Not the final mix.
        // Load the block with memcpy rather than casting the pointer: the
        // buffer has no uint32_t alignment guarantee and reading it through a
        // uint32_t lvalue breaks strict aliasing. The native byte order is
        // preserved, so the result is the same as before.
        uint32_t k;
        std::memcpy(&k, data, sizeof(k));

        k *= m;
        k ^= k >> r;
        k *= m;

        prev.h *= m;
        prev.h ^= k;

        prev.len -= 4;
    } else {
        // The final mix

        // Handle the last few bytes of the input array; each case
        // deliberately continues into the next one.
        switch (prev.len) {
            case 3:
                prev.h ^= data[2] << 16;
                // fallthrough
            case 2:
                prev.h ^= data[1] << 8;
                // fallthrough
            case 1:
                prev.h ^= data[0];
                prev.h *= m;
        }

        // Do a few final mixes of the hash to ensure the last few
        // bytes are well-incorporated.

        prev.h ^= prev.h >> 13;
        prev.h *= m;
        prev.h ^= prev.h >> 15;

        prev.len = 0;
    }
}

//-----------------------------------------------------------------------------