Perfect Hashing: FKS Method: Variations and an Implementation

[This page is auto-generated; please prefer to read the article’s PDF Form.]

Variations and an Implementation

Note that h₁ and h_2,i can be searched in any 𝜖-Universal classes. In above discussion, we just considered classes with 𝜖 = 1∕TableSize, for both level-1 and level-2 hashing. If we use any other 𝜖-Universal class for level-2 hashing, then the condition of Theorem 1 in [3] needs to be evaluated with the corresponding 𝜖, to ﬁnd the new condition which ensures certain collision-free functions. Similarly, if we use any other 𝜖-Universal class for level-1 hashing, the Corollary 4 in [3] may give a diﬀerent upper-bound for the sum ∑ i∈ℤm s_i². The size of table T would be dependent upon this sum, similar to what we saw above.

The following source code in C demonstrates an implementation of this method for keys of type “unsigned int”. The universal class used for selecting h₁ is from example-1 mentioned in page-2 of [3], with m = n. The same class type with same p but m = m_i = si(si − 1) is used for selecting functions h_2,i. Depending upon the requirement (e.g. diﬀerent key type), an implementation can choose diﬀerent universal classes for level-1 and level-2 functions.

Any symbol of the form ‘a_b’ from the above discussion has been shown as ‘a{b}’ in the code comments.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

#define N 100           /* size of the set S of static-keys */

/* the keys can be any 32-bit integer, so choosing ‘p’ as the smallest prime
   greater than 2^32-1 */
#define P 4294967311    /* 2^32 + 15 */

typedef unsigned int uint;

typedef struct object
{
  uint id;
  /* other data */
} object;

typedef struct hashtable
{
  uint   m;
  uint   a1, b1;                     /* for function h{1} */
  uint   a2[N], b2[N];            /* for functions h{2,i} */
  object **T;
  int    T_offset[N+1];
  int    T_size;
} hashtable;

/* "id" of these N objects are all distinct, forming the static-keys set */
object    objects[N];
hashtable ht;
int       *ht_s;

/* function: h(k) = ((ak+b) mod p) mod TableSize */
#define hash(k, a, b, tsize) (((((unsigned long)a)*k + b) %(P)) %(tsize))

void construct_hashtable()
{
  int i;
  /* the size of each subset S{i} belonging to a level-1 slot; required
     only during this method. */
  int s[N];

  srand(time(NULL));
  ht_s = s;
  ht.m = N;

  /* search for level-1 hash-function h{1} */
  search_h1();

  /* offsets of tables T{i} in T; returns sum of T{i} sizes */
  ht.T_size = calculate_offsets();

  ht.T = malloc(ht.T_size*sizeof(object *));
  memset(ht.T, 0, ht.T_size*sizeof(object *));

  /* distribute input keys among subsets S{i}, using hash-function h{1} */
  create_subsets_of_keys();

  for(i = 0; i < ht.m; i++)
  {
    if(ht_s[i] < 2)
      continue;

    /* search for level-2 hash-function h{2,i} */
    search_h2i(i);

    /* hash subset S{i} to hashtable T{i}, using hash-function h{2,i} */
    hash_subset_of_keys(i);
  }
}

void search_h1()
{
  int a, b;

  while(1)
  {
    a = 1 + rand();
    b = rand();

    /* check if the randomly selected function satisfies the upper-bound
       3N on the sum of s{i}*s{i} */
    if(check_si_sqr_sum(a, b))
      break;
  }

  ht.a1 = a;
  ht.b1 = b;
}

int check_si_sqr_sum(uint a, uint b)
{
  int i, h;
  int si_sqr_sum = 0;

  memset(ht_s, 0, N*sizeof(int));

  for(i = 0; i < N; i++)
  {
    h = hash(objects[i].id, a, b, ht.m);
    ht_s[h]++;
  }

  for(i = 0; i < N; i++)
    si_sqr_sum += ht_s[i]*ht_s[i];

  return si_sqr_sum < 3*N;
}

int calculate_offsets()
{
  int i;
  int lastsize = 0;

  ht.T_offset[0] = 0;

  for(i = 0; i < N; i++)
  {
    if(i > 0)
      ht.T_offset[i] = ht.T_offset[i-1] + lastsize;

    if(ht_s[i] > 1)
      lastsize = ht_s[i]*(ht_s[i]-1);
    else
      lastsize = ht_s[i];
  }

  /* extra element T_offset[N] is for simplifying macro SIZE_T() */
  ht.T_offset[N] = ht.T_offset[N-1] + lastsize;

  return ht.T_offset[N];
}

void create_subsets_of_keys()
{
  int i, h1, offset;
  int current_size[N];

  memset(current_size, 0, sizeof(current_size));

  /* we can use the initial part of hashtable T{i} to store subset S{i},
     till we hash its keys using level-2 hashing. */
  for(i = 0; i < N; i++)
  {
    h1 = hash(objects[i].id, ht.a1, ht.b1, ht.m);

    offset = ht.T_offset[h1];

    ht.T[offset + current_size[h1]] = &objects[i];

    current_size[h1]++;
  }
}

void search_h2i(int i)
{
  int a, b;
  int si = ht_s[i];
  int offset = ht.T_offset[i];
  int tsize = si*(si-1);

  /* the bitmap required for checking collision-free function */
  char *buffer;
  int  buffer_size = (tsize/8) + 1;
  char tmp;

  /* Each subset S{i} is first stored in table T{i} (which is in T) at its
     initial "si" elements, by create_subsets_of_keys(). Its keys are later
     hashed by h{2,i} among the "si*(si-1)" elements of T{i} (if si >= 2).

     For searching collision-free function for S{i}, we need a bitmap of
     size tsize. For this, we utilize the CURRENTLY unused space of T{i},
     which is after the index "offset+si-1". But for si=2, si*(si-1)=si,
     so there is no such unused space. */

  if(si == 2)
    buffer = &tmp;
  else
    buffer = (char*) &(ht.T[offset+si]);

  while(1)
  {
    memset(buffer, 0, buffer_size);

    a = 1 + rand();
    b = rand();

    /* check if the randomly selected function is collision-free for
       subset S{i} */
    if(check_collision_free(i, a, b, buffer))
      break;
  }

  ht.a2[i] = a;
  ht.b2[i] = b;

  if(si != 2)
    memset(buffer, 0, buffer_size);
}

#define CHECK_BIT(arr, i) (arr[i/8] & (1 << (i%8)))
#define SET_BIT(arr, i)   (arr[i/8] |= (1 << (i%8)))

int check_collision_free(int l1slot, uint a, uint b, char *buffer)
{
  int i, h;
  int offset = ht.T_offset[l1slot];
  int si = ht_s[l1slot];

  for(i = offset; i < offset + si; i++)
  {
    h = hash(ht.T[i]->id, a, b, (si*(si-1)));

    if(CHECK_BIT(buffer, h))
      return 0;

    SET_BIT(buffer, h);
  }

  return 1;
}

void hash_subset_of_keys(int l1slot)
{
  int    i, h2, target;
  int    offset = ht.T_offset[l1slot];
  int    si = ht_s[l1slot];
  uint   a2 = ht.a2[l1slot], b2 = ht.b2[l1slot];
  object *obj, *existing;

  for(i = offset; i < offset + si; i++)
  {
    obj = ht.T[i];
    ht.T[i] = NULL;

    h2 = hash(obj->id, a2, b2, (si*(si-1)));

    target = offset + h2;
    existing = ht.T[target];
    ht.T[target] = obj;

    /* We had used the initial part of hashtable T{i} to store subset S{i},
       and now we are hashing its keys across T{i}. So the "target" index
       may contain a not-yet-hashed key of S{i}. */
    if(existing)
    {
      ht.T[i] = existing;
      i--;
    }
  }
}

/* size of T{i} can be inferred from offsets */
#define SIZE_T(i) (ht.T_offset[i+1] - ht.T_offset[i])

object *lookup(uint k)
{
  int    h1, h2, tsize, index;
  object *obj;

  h1 = hash(k, ht.a1, ht.b1, ht.m);

  tsize = SIZE_T(h1);

  if(tsize == 0)
    return NULL;

  index = ht.T_offset[h1];

  /* no level-2 hashing was done for tsize=1 */
  if(tsize >= 2)
  {
    h2 = hash(k, ht.a2[h1], ht.b2[h1], tsize);
    index = index + h2;
  }

  obj = ht.T[index];

  if(obj == NULL || obj->id != k)
    return NULL;

  return obj;
}

Variations and an Implementation

References