Improved hashing tests and clarified spec.

884fca7a · Davis King · 5c94f100 · 884fca7a · 884fca7a · 884fca7a
Commit 884fca7a authored May 28, 2011 by Davis King
Hide whitespace changes
Inline Side-by-side

Showing with 83 additions and 30 deletions

hash_abstract.h dlib/general_hash/hash_abstract.h +28 -20

matrix_utilities_abstract.h dlib/matrix/matrix_utilities_abstract.h +6 -5

hash.cpp dlib/test/hash.cpp +49 -5

No files found.
--- a/dlib/general_hash/hash_abstract.h
+++ b/dlib/general_hash/hash_abstract.h
@@ -23,6 +23,8 @@ namespace dlib
            - Each value of seed results in a different hash function being used.  
              (e.g. hash(item,0) should generally not be equal to hash(item,1))
            - uses the murmur_hash3() routine to compute the actual hash.
+            - This routine will always give the same hash value when presented
+              with the same input string.
    !*/

 // ----------------------------------------------------------------------------------------
@@ -37,11 +39,12 @@ namespace dlib
            - Each value of seed results in a different hash function being used.  
              (e.g. hash(item,0) should generally not be equal to hash(item,1))
            - uses the murmur_hash3() routine to compute the actual hash.
-            - Note that the returned hash value will be different on big-endian and 
-              little-endian systems since hash() doesn't attempt to perform any byte 
-              swapping of the data contained in item.  If you want to always obtain 
-              the same hash then you need to byte swap the elements of item before 
-              passing it to hash().
+            - Note that if the memory layout of the elements in item changes between
+              hardware platforms then hash() will give different outputs.  If you want
+              hash() to always give the same output for the same input then you must 
+              ensure that elements of item always have the same layout in memory.
+              Typically this means using fixed width types and performing byte swapping
+              to account for endianness before passing item to hash().
    !*/

 // ----------------------------------------------------------------------------------------
@@ -60,11 +63,12 @@ namespace dlib
            - Each value of seed results in a different hash function being used.  
              (e.g. hash(item,0) should generally not be equal to hash(item,1))
            - uses the murmur_hash3() routine to compute the actual hash.
-            - Note that the returned hash value will be different on big-endian and 
-              little-endian systems since hash() doesn't attempt to perform any byte 
-              swapping of the data contained in item.  If you want to always obtain 
-              the same hash then you need to byte swap the elements of item before 
-              passing it to hash().
+            - Note that if the memory layout of the elements in item changes between
+              hardware platforms then hash() will give different outputs.  If you want
+              hash() to always give the same output for the same input then you must 
+              ensure that elements of item always have the same layout in memory.
+              Typically this means using fixed width types and performing byte swapping
+              to account for endianness before passing item to hash().
    !*/

 // ----------------------------------------------------------------------------------------
@@ -83,11 +87,12 @@ namespace dlib
            - Each value of seed results in a different hash function being used.  
              (e.g. hash(item,0) should generally not be equal to hash(item,1))
            - uses the murmur_hash3() routine to compute the actual hash.
-            - Note that the returned hash value will be different on big-endian and 
-              little-endian systems since hash() doesn't attempt to perform any byte 
-              swapping of the data contained in item.  If you want to always obtain 
-              the same hash then you need to byte swap the elements of item before 
-              passing it to hash().
+            - Note that if the memory layout of the elements in item changes between
+              hardware platforms then hash() will give different outputs.  If you want
+              hash() to always give the same output for the same input then you must 
+              ensure that elements of item always have the same layout in memory.
+              Typically this means using fixed width types and performing byte swapping
+              to account for endianness before passing item to hash().
    !*/

 // ----------------------------------------------------------------------------------------
@@ -106,11 +111,14 @@ namespace dlib
            - Each value of seed results in a different hash function being used.  
              (e.g. hash(item,0) should generally not be equal to hash(item,1))
            - uses the murmur_hash3() routine to compute the actual hash.
-            - Note that the returned hash value will be different on big-endian and 
-              little-endian systems since hash() doesn't attempt to perform any byte 
-              swapping of the data contained in item.  If you want to always obtain 
-              the same hash then you need to byte swap the elements of item before 
-              passing it to hash().
+            - Note that if the memory layout of the elements in item changes between
+              hardware platforms then hash() will give different outputs.  If you want
+              hash() to always give the same output for the same input then you must 
+              ensure that elements of item always have the same layout in memory.
+              Typically this means using fixed width types and performing byte swapping
+              to account for endianness before passing item to hash().  However, since
+              you can't modify the keys in a map you may have to copy its elements into a 
+              std::vector and then work from there.
    !*/

 // ----------------------------------------------------------------------------------------

--- a/dlib/matrix/matrix_utilities_abstract.h
+++ b/dlib/matrix/matrix_utilities_abstract.h
@@ -713,11 +713,12 @@ namespace dlib
            - Each value of seed results in a different hash function being used.  
              (e.g. hash(item,0) should generally not be equal to hash(item,1))
            - uses the murmur_hash3() routine to compute the actual hash.
-            - Note that the returned hash value will be different on big-endian and 
-              little-endian systems since hash() doesn't attempt to perform any byte 
-              swapping of the data contained in item.  If you want to always obtain 
-              the same hash then you need to byte swap the elements of item before 
-              passing it to hash().
+            - Note that if the memory layout of the elements in item changes between
+              hardware platforms then hash() will give different outputs.  If you want
+              hash() to always give the same output for the same input then you must 
+              ensure that elements of item always have the same layout in memory.
+              Typically this means using fixed width types and performing byte swapping
+              to account for endianness before passing item to hash().
    !*/

 // ----------------------------------------------------------------------------------------

--- a/dlib/test/hash.cpp
+++ b/dlib/test/hash.cpp
@@ -7,6 +7,7 @@
 #include <ctime>
 #include <dlib/hash.h>
 #include <dlib/matrix.h>
+#include <dlib/byte_orderer.h>

 #include "tester.h"

@@ -19,6 +20,32 @@ namespace
    logger dlog("test.hash");


+    template <typename T>
+    void to_little (  // Test helper: calls byte_orderer::host_to_little on every element of a std::vector.
+        std::vector<T>& item  // taken by non-const reference so its elements can be rewritten
+    )
+    {
+        byte_orderer bo;
+        for (unsigned long i = 0; i < item.size(); ++i)
+            bo.host_to_little(item[i]);  // presumably converts host -> little-endian byte order in place (byte_orderer is declared outside this view)
+    }
+
+
+    template <typename T>
+    void to_little (  // Test helper: calls byte_orderer::host_to_little on every element of a matrix.
+        matrix<T>& item  // taken by non-const reference so its elements can be rewritten
+    )
+    {
+        byte_orderer bo;
+        for (long r = 0; r < item.nr(); ++r)  // visit every row ...
+        {
+            for (long c = 0; c < item.nc(); ++c)  // ... and every column within it
+            {
+                bo.host_to_little(item(r,c));  // presumably converts host -> little-endian byte order in place (byte_orderer is declared outside this view)
+            }
+        }
+    }
+
    class test_hash : public tester
    {
    public:
@@ -31,47 +58,64 @@ namespace
        void perform_test (  // Compares hash() and murmur_hash3() results for several container types against fixed expected constants.
        )
        {
+            print_spinner();
            std::string str1 = "some random string";
-            std::wstring str2 = L"another String!";
            matrix<unsigned char> mat(2,2);

            mat = 1,2,3,4;

+            matrix<uint64> mat2(2,3);
+
+            mat2 = 1,2,3,4,5,6;
+
+            to_little(mat2);  // multi-byte elements: presumably normalized so the expected hash below does not depend on host endianness

            std::vector<unsigned char> v(4);
            v[0] = 'c';
            v[1] = 'a';
            v[2] = 't';
            v[3] = '!';

+            std::vector<uint16> v2(4);
+            v[0] = 'c';  // NOTE(review): this and the next three lines write to v again, not v2, so v2 keeps its value-initialized contents.
+            v[1] = 'a';  // Looks like a copy/paste slip (v2[...] was probably intended) -- TODO confirm.
+            v[2] = 't';  // The expected hash(v2) constants below presumably reflect the current contents,
+            v[3] = '!';  // so they would need to be recomputed if these assignments are changed to v2.
+            to_little(v2);

            std::map<unsigned char, unsigned char> m;
            m['c'] = 'C';
            m['a'] = 'A';
            m['t'] = 'T';

            dlog << LINFO << "hash(str1): "<< hash(str1);
-            dlog << LINFO << "hash(str2): "<< hash(str2);
            dlog << LINFO << "hash(v): "<< hash(v);
+            dlog << LINFO << "hash(v2): "<< hash(v2);
            dlog << LINFO << "hash(m): "<< hash(m);
            dlog << LINFO << "hash(mat): "<< hash(mat);
+            dlog << LINFO << "hash(mat2): "<< hash(mat2);

            DLIB_TEST(hash(str1) == 1073638390);
-            DLIB_TEST(hash(str2) == 2413364589);
            DLIB_TEST(hash(v) == 4054789286);
+            DLIB_TEST(hash(v2) == 1669671676);
            DLIB_TEST(hash(m) == 2865512303);
            DLIB_TEST(hash(mat) == 1043635621);
+            DLIB_TEST(hash(mat2) == 982899794);
            DLIB_TEST(murmur_hash3(&str1[0], str1.size(), 0) == 1073638390);  // same expected value as hash(str1) above

            dlog << LINFO << "hash(str1,1): "<< hash(str1,1);
-            dlog << LINFO << "hash(str2,2): "<< hash(str2,2);
            dlog << LINFO << "hash(v,3): "<< hash(v,3);
+            dlog << LINFO << "hash(v2,3): "<< hash(v2,3);
            dlog << LINFO << "hash(m,4): "<< hash(m,4);
            dlog << LINFO << "hash(mat,5): "<< hash(mat,5);
+            dlog << LINFO << "hash(mat2,6): "<< hash(mat2,6);

            DLIB_TEST(hash(str1,1) == 2977753747);
-            DLIB_TEST(hash(str2,2) == 3656927287);
            DLIB_TEST(hash(v,3) == 2127112268);
+            DLIB_TEST(hash(v2,3) == 2999850111);
            DLIB_TEST(hash(m,4) == 4200495810);
            DLIB_TEST(hash(mat,5) == 2380427865);
+            DLIB_TEST(hash(mat2,6) == 3098179348 );
            DLIB_TEST(murmur_hash3(&str1[0], str1.size(), 1) == 2977753747);  // same expected value as hash(str1,1) above

        }