Commit 884fca7a authored by Davis King

Improved hashing tests and clarified spec.

parent 5c94f100
...@@ -23,6 +23,8 @@ namespace dlib ...@@ -23,6 +23,8 @@ namespace dlib
- Each value of seed results in a different hash function being used. - Each value of seed results in a different hash function being used.
(e.g. hash(item,0) should generally not be equal to hash(item,1)) (e.g. hash(item,0) should generally not be equal to hash(item,1))
- uses the murmur_hash3() routine to compute the actual hash. - uses the murmur_hash3() routine to compute the actual hash.
- This routine will always give the same hash value when presented
with the same input string.
!*/ !*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
...@@ -37,11 +39,12 @@ namespace dlib ...@@ -37,11 +39,12 @@ namespace dlib
- Each value of seed results in a different hash function being used. - Each value of seed results in a different hash function being used.
(e.g. hash(item,0) should generally not be equal to hash(item,1)) (e.g. hash(item,0) should generally not be equal to hash(item,1))
- uses the murmur_hash3() routine to compute the actual hash. - uses the murmur_hash3() routine to compute the actual hash.
- Note that the returned hash value will be different on big-endian and - Note that if the memory layout of the elements in item changes between
little-endian systems since hash() doesn't attempt to perform any byte hardware platforms then hash() will give different outputs. If you want
swapping of the data contained in item. If you want to always obtain hash() to always give the same output for the same input then you must
the same hash then you need to byte swap the elements of item before ensure that elements of item always have the same layout in memory.
passing it to hash(). Typically this means using fixed width types and performing byte swapping
to account for endianness before passing item to hash().
!*/ !*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
...@@ -60,11 +63,12 @@ namespace dlib ...@@ -60,11 +63,12 @@ namespace dlib
- Each value of seed results in a different hash function being used. - Each value of seed results in a different hash function being used.
(e.g. hash(item,0) should generally not be equal to hash(item,1)) (e.g. hash(item,0) should generally not be equal to hash(item,1))
- uses the murmur_hash3() routine to compute the actual hash. - uses the murmur_hash3() routine to compute the actual hash.
- Note that the returned hash value will be different on big-endian and - Note that if the memory layout of the elements in item changes between
little-endian systems since hash() doesn't attempt to perform any byte hardware platforms then hash() will give different outputs. If you want
swapping of the data contained in item. If you want to always obtain hash() to always give the same output for the same input then you must
the same hash then you need to byte swap the elements of item before ensure that elements of item always have the same layout in memory.
passing it to hash(). Typically this means using fixed width types and performing byte swapping
to account for endianness before passing item to hash().
!*/ !*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
...@@ -83,11 +87,12 @@ namespace dlib ...@@ -83,11 +87,12 @@ namespace dlib
- Each value of seed results in a different hash function being used. - Each value of seed results in a different hash function being used.
(e.g. hash(item,0) should generally not be equal to hash(item,1)) (e.g. hash(item,0) should generally not be equal to hash(item,1))
- uses the murmur_hash3() routine to compute the actual hash. - uses the murmur_hash3() routine to compute the actual hash.
- Note that the returned hash value will be different on big-endian and - Note that if the memory layout of the elements in item changes between
little-endian systems since hash() doesn't attempt to perform any byte hardware platforms then hash() will give different outputs. If you want
swapping of the data contained in item. If you want to always obtain hash() to always give the same output for the same input then you must
the same hash then you need to byte swap the elements of item before ensure that elements of item always have the same layout in memory.
passing it to hash(). Typically this means using fixed width types and performing byte swapping
to account for endianness before passing item to hash().
!*/ !*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
...@@ -106,11 +111,14 @@ namespace dlib ...@@ -106,11 +111,14 @@ namespace dlib
- Each value of seed results in a different hash function being used. - Each value of seed results in a different hash function being used.
(e.g. hash(item,0) should generally not be equal to hash(item,1)) (e.g. hash(item,0) should generally not be equal to hash(item,1))
- uses the murmur_hash3() routine to compute the actual hash. - uses the murmur_hash3() routine to compute the actual hash.
- Note that the returned hash value will be different on big-endian and - Note that if the memory layout of the elements in item changes between
little-endian systems since hash() doesn't attempt to perform any byte hardware platforms then hash() will give different outputs. If you want
swapping of the data contained in item. If you want to always obtain hash() to always give the same output for the same input then you must
the same hash then you need to byte swap the elements of item before ensure that elements of item always have the same layout in memory.
passing it to hash(). Typically this means using fixed width types and performing byte swapping
to account for endianness before passing item to hash(). However, since
you can't modify the keys in a map you may have to copy it into a
std::vector and then work from there.
!*/ !*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
......
...@@ -713,11 +713,12 @@ namespace dlib ...@@ -713,11 +713,12 @@ namespace dlib
- Each value of seed results in a different hash function being used. - Each value of seed results in a different hash function being used.
(e.g. hash(item,0) should generally not be equal to hash(item,1)) (e.g. hash(item,0) should generally not be equal to hash(item,1))
- uses the murmur_hash3() routine to compute the actual hash. - uses the murmur_hash3() routine to compute the actual hash.
- Note that the returned hash value will be different on big-endian and - Note that if the memory layout of the elements in item changes between
little-endian systems since hash() doesn't attempt to perform any byte hardware platforms then hash() will give different outputs. If you want
swapping of the data contained in item. If you want to always obtain hash() to always give the same output for the same input then you must
the same hash then you need to byte swap the elements of item before ensure that elements of item always have the same layout in memory.
passing it to hash(). Typically this means using fixed width types and performing byte swapping
to account for endianness before passing item to hash().
!*/ !*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <ctime> #include <ctime>
#include <dlib/hash.h> #include <dlib/hash.h>
#include <dlib/matrix.h> #include <dlib/matrix.h>
#include <dlib/byte_orderer.h>
#include "tester.h" #include "tester.h"
...@@ -19,6 +20,32 @@ namespace ...@@ -19,6 +20,32 @@ namespace
logger dlog("test.hash"); logger dlog("test.hash");
template <typename T>
void to_little (
std::vector<T>& item
)
{
byte_orderer bo;
for (unsigned long i = 0; i < item.size(); ++i)
bo.host_to_little(item[i]);
}
// Test helper: visits every element of item in row-major order and passes
// it through byte_orderer::host_to_little() so it is updated in place.
// NOTE(review): presumably this converts each element from host byte order
// to little-endian so the hashed bytes do not depend on the platform --
// confirm against the byte_orderer spec.
template <typename T>
void to_little (
    matrix<T>& item
)
{
    byte_orderer converter;
    for (long row = 0; row < item.nr(); ++row)
        for (long col = 0; col < item.nc(); ++col)
            converter.host_to_little(item(row,col));
}
class test_hash : public tester class test_hash : public tester
{ {
public: public:
...@@ -31,47 +58,64 @@ namespace ...@@ -31,47 +58,64 @@ namespace
// Regression test for hash(): builds a narrow string, a wide string, two
// matrices, two vectors and a map, logs their hashes, and checks each one
// against a fixed expected value, first with the default seed and then
// with explicit seeds 1..6.  It also checks that hash(str1) and hash(str1,1)
// equal a direct murmur_hash3() call over str1's bytes with seeds 0 and 1.
void perform_test ( void perform_test (
) )
{ {
print_spinner();
std::string str1 = "some random string"; std::string str1 = "some random string";
// NOTE(review): unlike mat2 and v2, str2 is hashed without going through
// to_little(); the size and byte order of wchar_t are platform-defined, so
// the expected values for hash(str2) may not be portable -- confirm.
std::wstring str2 = L"another String!";
matrix<unsigned char> mat(2,2); matrix<unsigned char> mat(2,2);
mat = 1,2,3,4; mat = 1,2,3,4;
// Multi-byte elements: run through to_little() before hashing.
matrix<uint64> mat2(2,3);
mat2 = 1,2,3,4,5,6;
to_little(mat2);
std::vector<unsigned char> v(4); std::vector<unsigned char> v(4);
v[0] = 'c'; v[0] = 'c';
v[1] = 'a'; v[1] = 'a';
v[2] = 't'; v[2] = 't';
v[3] = '!'; v[3] = '!';
// NOTE(review): the four assignments below write to v (re-storing the
// characters it already holds), not to v2, so v2 still contains the four
// value-initialized (zero) elements from its constructor when it is hashed.
// The expected values for hash(v2) and hash(v2,3) were presumably recorded
// for that all-zero content -- if these are changed to v2[...], the
// expected values must be regenerated.
std::vector<uint16> v2(4);
v[0] = 'c';
v[1] = 'a';
v[2] = 't';
v[3] = '!';
to_little(v2);
std::map<unsigned char, unsigned char> m; std::map<unsigned char, unsigned char> m;
m['c'] = 'C'; m['c'] = 'C';
m['a'] = 'A'; m['a'] = 'A';
m['t'] = 'T'; m['t'] = 'T';
// Log the default-seed hashes so a failing run shows the actual values.
dlog << LINFO << "hash(str1): "<< hash(str1); dlog << LINFO << "hash(str1): "<< hash(str1);
dlog << LINFO << "hash(str2): "<< hash(str2);
dlog << LINFO << "hash(v): "<< hash(v); dlog << LINFO << "hash(v): "<< hash(v);
dlog << LINFO << "hash(v2): "<< hash(v2);
dlog << LINFO << "hash(m): "<< hash(m); dlog << LINFO << "hash(m): "<< hash(m);
dlog << LINFO << "hash(mat): "<< hash(mat); dlog << LINFO << "hash(mat): "<< hash(mat);
dlog << LINFO << "hash(mat2): "<< hash(mat2);
// Default-seed hashes must match the recorded values.
DLIB_TEST(hash(str1) == 1073638390); DLIB_TEST(hash(str1) == 1073638390);
DLIB_TEST(hash(str2) == 2413364589);
DLIB_TEST(hash(v) == 4054789286); DLIB_TEST(hash(v) == 4054789286);
DLIB_TEST(hash(v2) == 1669671676);
DLIB_TEST(hash(m) == 2865512303); DLIB_TEST(hash(m) == 2865512303);
DLIB_TEST(hash(mat) == 1043635621); DLIB_TEST(hash(mat) == 1043635621);
DLIB_TEST(hash(mat2) == 982899794);
// hash(str1) must agree with murmur_hash3() applied directly to str1's bytes with seed 0.
DLIB_TEST(murmur_hash3(&str1[0], str1.size(), 0) == 1073638390); DLIB_TEST(murmur_hash3(&str1[0], str1.size(), 0) == 1073638390);
// Same checks again with explicit, non-default seeds.
dlog << LINFO << "hash(str1,1): "<< hash(str1,1); dlog << LINFO << "hash(str1,1): "<< hash(str1,1);
dlog << LINFO << "hash(str2,2): "<< hash(str2,2);
dlog << LINFO << "hash(v,3): "<< hash(v,3); dlog << LINFO << "hash(v,3): "<< hash(v,3);
dlog << LINFO << "hash(v2,3): "<< hash(v2,3);
dlog << LINFO << "hash(m,4): "<< hash(m,4); dlog << LINFO << "hash(m,4): "<< hash(m,4);
dlog << LINFO << "hash(mat,5): "<< hash(mat,5); dlog << LINFO << "hash(mat,5): "<< hash(mat,5);
dlog << LINFO << "hash(mat2,6): "<< hash(mat2,6);
DLIB_TEST(hash(str1,1) == 2977753747); DLIB_TEST(hash(str1,1) == 2977753747);
DLIB_TEST(hash(str2,2) == 3656927287);
DLIB_TEST(hash(v,3) == 2127112268); DLIB_TEST(hash(v,3) == 2127112268);
DLIB_TEST(hash(v2,3) == 2999850111);
DLIB_TEST(hash(m,4) == 4200495810); DLIB_TEST(hash(m,4) == 4200495810);
DLIB_TEST(hash(mat,5) == 2380427865); DLIB_TEST(hash(mat,5) == 2380427865);
DLIB_TEST(hash(mat2,6) == 3098179348 );
// hash(str1,1) must agree with murmur_hash3() applied directly to str1's bytes with seed 1.
DLIB_TEST(murmur_hash3(&str1[0], str1.size(), 1) == 2977753747); DLIB_TEST(murmur_hash3(&str1[0], str1.size(), 1) == 2977753747);
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment