// +-----------------------------------------------------------------------+
// |                                                                       |
// | Construct a LB-tree for NN search by using agglomerative clustering.  |
// |                                                                       |
// + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +
// |                                                                       |
// | Author: Yong-Sheng Chen (yschen@iis.sinica.edu.tw)      10/10/00      |
// |         Institute of Information Science                              |
// |         Academia Sinica, Taipei, Taiwan                               |
// |                                                                       |
// +-----------------------------------------------------------------------+

#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/timeb.h>
#include "cluster.h"

// Number of clusters kept at the first level: merge_1D() keeps merging while
// its list holds more entries than this.
#define NUM_TOPCLUSTERS 20  // number of clusters at the first level

// Compare-list node: one candidate pair of clusters together with the
// largest point-to-point distance between their members.
typedef struct
{
   double     dmax;   // maximum distance between clusters p1 and p2
   MEAN_NODE *p1;     // first cluster of the pair
   MEAN_NODE *p2;     // second cluster of the pair
} CL_NODE;

// ---- state shared by the routines in this file ----
MEAN_NODE *root = NULL;     // root of the tree structure
double    *pointsdb;        // transformed sample points (pnum*dim doubles)
double    *smean;           // scratch buffer for a temporary cluster mean
int        dim;             // dimension of a sample point
int        pnum;            // number of sample points

// ---- helpers implemented outside this file ----
// main() calls read_points(file name, &dim, &pnum).
double *read_points(char *, int *, int *);
// main() calls transform(original point, destination inside pointsdb).
void    transform(double *, double *);
// NOTE(review): main() calls initialize_transformation(dim), not
// initialize_transform(); confirm which name is actually defined.
void    initialize_transform(int);

// ---- tree parameters defined outside this file ----
extern int     maxLevel;    // maximum level of the tree
extern int    *mdim;        // dimensions at each level
extern double *rthresh;     // radius threshold at each level


// compare procedure for qsort
static int point1Dcompare(const void *i, const void *j)
{
   MEAN_NODE **p, **q;

   p=(MEAN_NODE **)i;
   q=(MEAN_NODE **)j;
   if (*((*p)->data) > *((*q)->data))
      return (1);
   if (*((*p)->data) < *((*q)->data))
      return (-1);
   return (0);
}


// qsort() comparator for an array of CL_NODE: orders the entries by dmax,
// ascending.
static int dmaxcompare(const void *i, const void *j)
{
   const CL_NODE *lhs = (const CL_NODE *)i;
   const CL_NODE *rhs = (const CL_NODE *)j;

   if (lhs->dmax < rhs->dmax)
      return -1;
   return (lhs->dmax > rhs->dmax) ? 1 : 0;
}


// Largest Euclidean distance between any node of list p1 and any node of
// list p2 (both linked through `sibling`), using the first ndim coordinates.
// Returns 0 when either list is empty.
double calc_dmax(int ndim, MEAN_NODE *p1, MEAN_NODE *p2)
{
   double best_sq = 0;
   MEAN_NODE *a, *b;
   int d;

   for (a = p1; a != NULL; a = a->sibling)
   {
      for (b = p2; b != NULL; b = b->sibling)
      {
         double dist_sq = 0;

         for (d = 0; d < ndim; d++)
         {
            double diff = a->data[d] - b->data[d];
            dist_sq += diff * diff;
         }
         // compare squared distances; the root is taken once at the end
         if (dist_sq > best_sq)
            best_sq = dist_sq;
      }
   }

   return sqrt(best_sq);
}


// Compute the mean of all nodes on the two `sibling`-linked lists p1 and p2
// and the radius of the smallest ball around that mean covering all of them.
//
//   ndim    number of coordinates to use
//   mean    out: ndim doubles receiving the mean
//   radius  out: largest distance from the mean to any node
void calc_mean_r(int ndim, MEAN_NODE *p1, MEAN_NODE *p2, double *mean,
		 double *radius)
{
   MEAN_NODE *lists[2];
   MEAN_NODE *node;
   int which, d;
   int total = 0;
   double far_sq = 0;

   lists[0] = p1;
   lists[1] = p2;

   // accumulate the coordinate sums, p1 first and then p2
   for (d = 0; d < ndim; d++)
      mean[d] = 0.0;
   for (which = 0; which < 2; which++)
   {
      for (node = lists[which]; node != NULL; node = node->sibling)
      {
         total++;
         for (d = 0; d < ndim; d++)
            mean[d] += node->data[d];
      }
   }
   for (d = 0; d < ndim; d++)
      mean[d] /= total;

   // radius: largest squared distance from any node to the mean
   for (which = 0; which < 2; which++)
   {
      for (node = lists[which]; node != NULL; node = node->sibling)
      {
         double dist_sq = 0;

         for (d = 0; d < ndim; d++)
         {
            double diff = node->data[d] - mean[d];
            dist_sq += diff * diff;
         }
         if (dist_sq > far_sq)
            far_sq = dist_sq;
      }
   }

   *radius = sqrt(far_sq);
}


// Agglomerative clustering for the one-dimensional case.
//
// Every node on the list `head` is a cluster whose members are the nodes on
// its `child` list. The nodes are sorted by data[0]; only neighbouring pairs
// are candidates, and the pair with the smallest dmax is merged repeatedly
// until no more than NUM_TOPCLUSTERS clusters remain.
//
//   head     `sibling`-linked list of nodes with ndim==1 (may be NULL)
//   Tradius  out: radius of the cluster produced by the last merge, or 0.0
//            when no merge was performed
//
// Returns the head of the relinked list (the sorted array is relinked back
// to front). Uses the global `smean` as scratch space. Exits the program
// when a node is not one-dimensional or when allocation fails.
MEAN_NODE *merge_1D(MEAN_NODE *head, double *Tradius)
{
   MEAN_NODE *mnp1, *mnp2, *tmp1, *tmp2;
   int nlist, ncl, i, j, k;
   CL_NODE *clhead, *clp;
   MEAN_NODE **cphead, **cpp;
   double dmax;

   // Give the caller a defined value even when the merge loop never runs
   // (the list already has NUM_TOPCLUSTERS entries or fewer).
   *Tradius=0.0;

   if (head==NULL)   // nothing to cluster
      return(NULL);

   if (head->ndim != 1)
   {
      fprintf(stderr, "merge_1D(): only one dimension is allowd!\n");
      exit(1);
   }

   // calculate #element in the list
   for (mnp1=head, nlist=0; mnp1!=NULL; mnp1=mnp1->sibling, nlist++);

   ncl = nlist-1;  // only neighbors need to be considered
   // malloc(0) may legitimately return NULL, so always request at least one
   // element; otherwise a single-node list would be reported as an error.
   clhead = (CL_NODE *)malloc(sizeof(CL_NODE)*(ncl>0 ? ncl : 1));
   cphead = (MEAN_NODE **)malloc(sizeof(MEAN_NODE *)*nlist);
   if (clhead==NULL || cphead==NULL)
   {
      fprintf(stderr, "merge_1D(): malloc error.\n");
      exit(1);
   }

   // sort the nodes according to the first coordinate and store in cphead
   for (mnp1=head, cpp=cphead; mnp1!=NULL; mnp1=mnp1->sibling, cpp++)
      *cpp=mnp1;
   qsort(cphead, nlist, sizeof(MEAN_NODE *), point1Dcompare);

   // calculate dmax in 1D for all neighboring pairs
   for (i=0; i<ncl; i++)
   {
      clhead[i].dmax=cphead[i+1]->data[0]-cphead[i]->data[0];
      clhead[i].p1=cphead[i];
      clhead[i].p2=cphead[i+1];
   }

   // sort the clhead list according to dmax
   qsort(clhead, ncl, sizeof(CL_NODE), dmaxcompare);

   // iteratively merge the pair with minimum dmax; each pass removes one
   // cluster. clp always points at the current minimum entry.
   clp=clhead;
   while (nlist > NUM_TOPCLUSTERS)
   {
      mnp1=clp[0].p1;   // surviving cluster
      mnp2=clp[0].p2;   // cluster absorbed into mnp1

      // calculate the mean and radius of the merged cluster
      calc_mean_r(1, mnp1->child, mnp2->child, smean, Tradius);

      // merge the two clusters and keep the result in mnp1
      mnp1->data[0]=smean[0];
      mnp1->radius=*Tradius;
      // catenate the children of mnp1 and mnp2
      for (tmp1=mnp1->child; tmp1->sibling!=NULL; tmp1=tmp1->sibling);
      tmp1->sibling=mnp2->child;

      // delete mnp2 from cphead
      for (j=0; cphead[j]!=mnp2; j++);
      nlist--;
      while (j<nlist)
      {
         cphead[j]=cphead[j+1];
         j++;
      }

      // delete clp[0] in cl list due to the mergence of p1 and p2
      ncl--;
      clp++;

      // the pair (left neighbour, mnp1) now faces the enlarged cluster
      for (j=0; j<ncl; j++)
         if (clp[j].p2==mnp1)
         {
            tmp1=clp[j].p1;
            tmp2=clp[j].p2;
            dmax=calc_dmax(1, tmp1->child, tmp2->child);

            // dmax can only grow: slide the entry towards the tail until
            // the list is sorted again
            for (k=j; k<ncl-1 && dmax>clp[k+1].dmax; k++)
               memcpy(&clp[k], &clp[k+1], sizeof(CL_NODE));
            clp[k].p1=tmp1;
            clp[k].p2=tmp2;
            clp[k].dmax=dmax;
            break;   // there will be only one pair with p2==mnp1
         }
      // the pair (mnp2, right neighbour) becomes (mnp1, right neighbour)
      for (j=0; j<ncl; j++)
         if (clp[j].p1==mnp2)
         {
            tmp1=mnp1;  // replace mnp2 by mnp1
            tmp2=clp[j].p2;
            dmax=calc_dmax(1, tmp1->child, tmp2->child);

            // rearrange list according to the enlarged dmax
            for (k=j; k<ncl-1 && dmax>clp[k+1].dmax; k++)
               memcpy(&clp[k], &clp[k+1], sizeof(CL_NODE));
            clp[k].p1=tmp1;
            clp[k].p2=tmp2;
            clp[k].dmax=dmax;
            break;   // there will be only one pair with p1==mnp2
         }

      // Release the absorbed node only now, after its address is no longer
      // needed for the look-ups above.
      free(mnp2->data);
      free(mnp2);
   }

   // link the list again by using cphead
   for (i=0, head=NULL; i<nlist; i++)
   {
      cphead[i]->sibling=head;
      head=cphead[i];
   }

   free(clhead);
   free(cphead);

   return(head);
}


// Agglomerative clustering for the general (multi-dimensional) case.
//
// Every node on the list `head` is a cluster whose members are the nodes on
// its `child` list. All pairs of clusters are examined in order of increasing
// dmax; a pair is merged when the radius of the merged cluster stays below
// Tradius. Pairs whose dmax is 2*Tradius or more are never considered.
//
//   head     `sibling`-linked list of clusters (may be NULL)
//   Tradius  radius threshold for a merged cluster
//
// Returns `head`; the first node is never removed because the second member
// of a pair always follows the first in the list. Uses the global `smean` as
// scratch space. Exits the program when the pair table cannot be allocated
// or its size does not fit an int.
MEAN_NODE *merge(MEAN_NODE *head, double Tradius)
{
   MEAN_NODE *mnp1, *mnp2, *tmp1, *tmp2;
   int nlist, ncl, i, j, k, ndim;
   CL_NODE *clhead, *clp;
   double radius, dmax;
   unsigned long long npairs;

   if (head==NULL)   // nothing to cluster
      return(NULL);

   ndim=head->ndim;

   // calculate #element in the list
   for (mnp1=head, nlist=0; mnp1!=NULL; mnp1=mnp1->sibling, nlist++);

   // A single cluster has no partner. Returning here also avoids malloc(0),
   // which may return NULL and would be misreported as an error.
   if (nlist<2)
      return(head);

   // npairs = C(nlist,2), distances for all pairs. Computed in a wide type
   // because nlist*(nlist-1) overflows int once nlist reaches 46342.
   npairs = (unsigned long long)nlist*(unsigned long long)(nlist-1)/2;
   if (npairs > (unsigned long long)INT_MAX ||
       npairs > (size_t)-1/sizeof(CL_NODE))
   {
      fprintf(stderr, "merge(): too many clusters (%d) to compare pairwise.\n",
              nlist);
      exit(1);
   }
   ncl = (int)npairs;

   clhead = (CL_NODE *)malloc(sizeof(CL_NODE)*ncl);
   if (clhead==NULL)
   {
      fprintf(stderr, "merge(): malloc error.\n");
      exit(1);
   }

   // calculate dmax for all pairs
   clp=clhead;
   for (mnp1=head; mnp1->sibling!=NULL; mnp1=mnp1->sibling)
   for (mnp2=mnp1->sibling; mnp2!=NULL; mnp2=mnp2->sibling)
   {
      clp->dmax=calc_dmax(ndim, mnp1->child, mnp2->child);
      clp->p1=mnp1;
      clp->p2=mnp2;
      clp++;
   }

   // sort the list according to dmax
   qsort(clhead, ncl, sizeof(CL_NODE), dmaxcompare);

   // check each pair from the one with minimum dmax
   i=0;
   while (i<ncl && clhead[i].dmax<2*Tradius)
   // merging two clusters is possible only when their dmax distance is
   // smaller than 2*threshold
   {
      // calculate the mean and radius for two clusters if they are merged
      calc_mean_r(ndim, clhead[i].p1->child,clhead[i].p2->child,smean,&radius);

      if (radius<Tradius)   // fulfill the radius constraint
      {
         // actually merge these two clusters (p1 and p2) and save to p1
         memcpy(clhead[i].p1->data, smean, sizeof(double)*ndim);
         clhead[i].p1->radius=radius;
         // catenate the children of p1 and p2
         for (mnp1=clhead[i].p1->child;mnp1->sibling!=NULL;mnp1=mnp1->sibling);
         mnp1->sibling=clhead[i].p2->child;

         // unlink p2 from the cluster list
         mnp2=clhead[i].p2;
         for (mnp1=head; mnp1->sibling!=mnp2; mnp1=mnp1->sibling);
         mnp1->sibling=mnp2->sibling;

         mnp1=clhead[i].p1;   // surviving cluster

         // delete all entries containing p2 from the cl list; the scan
         // always stops because clhead[i] itself contains p2
         for (j=0; clhead[j].p1!=mnp2 && clhead[j].p2!=mnp2; j++); // first one
         if (j<i)
            i=j;   // update the current checking position
         for (k=j+1; k<ncl; k++)
         {
            if (clhead[k].p1!=mnp2 && clhead[k].p2!=mnp2)
            {
               memcpy(&clhead[j], &clhead[k], sizeof(CL_NODE));
               j++;
            }
         }
         ncl=j;

         // Release p2 only now, after its address is no longer needed for
         // the look-ups above.
         free(mnp2->data);
         free(mnp2);

         // replace dmax of p1 with "p1 union p2" in cl list
         for (j=0; j<ncl; j++)
            if (clhead[j].p1==mnp1 || clhead[j].p2==mnp1)
            {
               clhead[j].dmax=calc_dmax(ndim, clhead[j].p1->child,
                                              clhead[j].p2->child);
               if (j<i)
                  i=j;  // update the current checking position
            }

         // rearrange list by using insertion sort (dmax is enlarged);
         // walking from the tail keeps the part behind j sorted
         for (j=ncl-2; j>=0; j--)
            if (clhead[j].p1==mnp1 || clhead[j].p2==mnp1)
            {
               tmp1=clhead[j].p1;
               tmp2=clhead[j].p2;
               dmax=clhead[j].dmax;
               for (k=j; k<ncl-1 && dmax>clhead[k+1].dmax; k++)
                  memcpy(&clhead[k], &clhead[k+1], sizeof(CL_NODE));
               clhead[k].p1=tmp1;
               clhead[k].p2=tmp2;
               clhead[k].dmax=dmax;
            }
      }
      else
         i++;
   }

   free(clhead);

   return(head);
}


// Cluster a list of nodes at the given tree level and recurse into the next
// level for each resulting cluster.
//
//   parent  `sibling`-linked list of nodes to be clustered; the list is
//           consumed: every node is detached and becomes the only child of
//           a newly allocated node of dimension mdim[level]
//   level   current tree level (0 is the one-dimensional top level)
//
// Returns the list of clusters created at this level. Exits the program
// when allocation fails.
MEAN_NODE *clustering(MEAN_NODE *parent, int level)
{
   int ndim;
   MEAN_NODE *head, *mnp;

   // initializing
   ndim=mdim[level];   // set the dimension at current level

   head=NULL;
   while (parent!=NULL)
   {
      // generate a new MEAN NODE at current level and link them into a list
      mnp = (MEAN_NODE *)malloc(sizeof(MEAN_NODE));
      if (mnp==NULL)
      {
         fprintf(stderr, "clustering(): malloc error.\n");
         exit(1);
      }
      mnp->ndim=ndim;
      mnp->data=(double *)malloc(sizeof(double)*ndim);
      if (mnp->data==NULL)
      {
         fprintf(stderr, "clustering(): malloc error.\n");
         exit(1);
      }
      // the new node starts as a copy of the first ndim coordinates
      memcpy(mnp->data, parent->data, sizeof(double)*ndim);
      mnp->radius=0;
      mnp->sibling=head;
      head=mnp;
      mnp->child=parent;
      parent=parent->sibling;      // advance before detaching the child
      mnp->child->sibling=NULL;
   }

   // merge the elements in the list according to the radius threshold
   if (level==0)   // special method for level 0 because there is only one dim.
   {
      double rthresh_org;

      head=merge_1D(head, &rthresh_org);
      // initialize_rthresh() is defined elsewhere; presumably it fills
      // rthresh[] from the level-0 radius -- TODO confirm.
      initialize_rthresh(rthresh_org);
   }
   else            // general method for other levels higher than 0
      head=merge(head, rthresh[level]);

   // further clustering at next level for each cluster at current level
   level++;
   if (level<maxLevel)
   {
      for (mnp=head; mnp!=NULL; mnp=mnp->sibling)
         mnp->child=clustering(mnp->child, level);
   }

   return(head);
}


// Count the nodes of the forest starting at `start` (the siblings and,
// recursively, all their descendants).
//
//   dsize  in/out: incremented by the sum of ndim over all counted nodes
//
// Returns the number of nodes counted.
int calc_nsize(MEAN_NODE *start, int *dsize)
{
   int total = 0;
   MEAN_NODE *node = start;

   while (node != NULL)
   {
      int subtree_dims = 0;

      // this node plus everything below it
      total += 1;
      total += calc_nsize(node->child, &subtree_dims);

      *dsize += node->ndim;
      *dsize += subtree_dims;

      node = node->sibling;
   }
   return total;
}


// Write `count` items of `size` bytes from `buf` to `fp`; report the failure
// and exit on a short write.
static void reorg_write(const void *buf, size_t size, size_t count, FILE *fp)
{
   if (fwrite(buf, size, count, fp) != count)
   {
      fprintf(stderr, "reorganization(): write error.\n");
      exit(1);
   }
}


// Flatten the tree into two contiguous arrays and save them as a disk image.
//
// Written to fp, in this order: int #nodes, int total #coordinates, the node
// array, the coordinate array. Inside the saved nodes the pointers are
// replaced by relative values: `data` is an element offset into the
// coordinate array, `sibling` and `child` are indices into the node array.
// For a childless node copied in the breadth-first pass, `child` holds the
// index of the sample point in pointsdb.
//
//   mnp  first node of the top level (may be NULL: only the two counts,
//        both zero, are written)
//   fp   output stream opened for binary writing
//
// The original nodes are left untouched. Exits the program on allocation or
// write failure.
void reorganization(MEAN_NODE *mnp, FILE *fp)
{
   int i, j, nsize, dsize=0;
   MEAN_NODE *nroot;
   double *ndata, *hdata;

   nsize=calc_nsize(mnp, &dsize);  // calc. #nodes and total data dim.
   reorg_write(&nsize, sizeof(int), 1, fp); // #mean nodes
   reorg_write(&dsize, sizeof(int), 1, fp); // total dimensions for all nodes

   // An empty tree has nothing more to save; continuing would index
   // nroot[-1] below.
   if (nsize==0)
      return;

   // allocate a big continuous memory for mean nodes and their data
   nroot=(MEAN_NODE *)malloc(sizeof(MEAN_NODE)*nsize);
   hdata=ndata=(double *)malloc(sizeof(double)*dsize);
   if (nroot==NULL || ndata==NULL)
   {
      fprintf(stderr, "reorganization(): malloc error.\n");
      exit(1);
   }

   j=0;
   while (mnp!=NULL)  // for all the mean nodes on the top level
   {
      // copy a mean node to its newly allocated position
      nroot[j].ndim=mnp->ndim;
      nroot[j].data=ndata;  // copy data to its newly allocated position
      ndata+=mnp->ndim;
      memcpy(nroot[j].data, mnp->data, sizeof(double)*mnp->ndim);
      nroot[j].radius=mnp->radius;
      nroot[j].child=mnp->child;   // still the old address; fixed up below
      nroot[j].sibling=&nroot[j+1];
      mnp=mnp->sibling;
      j++;
   }
   nroot[j-1].sibling=NULL;

   // sequentially process the children for all nodes in Breadth-First order:
   // i walks the nodes already copied, j is the next free slot
   for (i=0; i<nsize; i++)
      if (nroot[i].ndim!=dim)  // no need for leaf node
      {
         mnp=nroot[i].child;
         nroot[i].child=&nroot[j];  // update the position for the child node
         while (mnp!=NULL)
         {
            nroot[j].ndim=mnp->ndim;
            nroot[j].data=ndata;
            ndata+=mnp->ndim;
            memcpy(nroot[j].data, mnp->data, sizeof(double)*mnp->ndim);
            nroot[j].radius=mnp->radius;
            if (mnp->child)                              // internal node
               nroot[j].child=mnp->child;
            else                                         // leaf node
               nroot[j].child=(MEAN_NODE *)((mnp->data-pointsdb)/dim);
                                                         // index in data set
            nroot[j].sibling=&nroot[j+1];
            mnp=mnp->sibling;
            j++;
         }
         nroot[j-1].sibling=NULL;
      }

   // Reallocate the address by using relative addressing for saving in file
   for (i=0; i<nsize; i++)
   {
      mnp=&nroot[i];
      mnp->data = (double *)(mnp->data-hdata);
      if (mnp->sibling)
         mnp->sibling = (MEAN_NODE *)(mnp->sibling-nroot);
      if (mnp->ndim!=dim)   // leaf nodes already carry a sample index
         mnp->child = (MEAN_NODE *)(mnp->child-nroot);
   }

   // save the tree structure and the associated data points in file
   reorg_write(nroot, sizeof(MEAN_NODE), (size_t)nsize, fp);
   reorg_write(hdata, sizeof(double), (size_t)dsize, fp);

   free(nroot);
   free(hdata);
}


// Program entry: read the sample points, transform them, build the cluster
// tree and save it.
//
//   argv[1]  input file of the sample points
//   argv[2]  output file receiving dim, pnum and the tree image
//
// Prints the elapsed preprocessing time in milliseconds. Exits with status 1
// on bad usage, unreadable input, allocation failure or write failure.
int main(int argc, char *argv[])
{
   int i;
   double *points;
   FILE *fp;
   MEAN_NODE *mnp;
   struct timeb ts, tp;
   double elapse_time;

   if (argc != 3)
   {
      fprintf(stderr, "Usage: %s sp_fname ssp_fname\n", argv[0]);
      fprintf(stderr, "\t  sp_fname: input file of the sample points\n");
      fprintf(stderr, "\t ssp_fname: output file of the sample point tree structure\n");
      exit(1);
   }

   // read the data set of sample points
   points=read_points(argv[1], &dim, &pnum);

   if ( points==NULL)
   {
      fprintf(stderr, "Reading for sample points failed\n");
      exit(1);
   }

   // the sizes drive every allocation and loop below
   if (dim<=0 || pnum<=0)
   {
      fprintf(stderr, "Invalid sample set: dim=%d, pnum=%d\n", dim, pnum);
      exit(1);
   }

   system("date");
   ftime(&ts);

   // NOTE(review): the prototype near the top of this file is spelled
   // initialize_transform(); confirm which name is actually defined.
   initialize_transformation(dim);

   pointsdb = (double *)malloc(sizeof(double)*pnum*dim);
   smean = (double *)malloc(sizeof(double)*dim);

   if ( pointsdb==NULL || smean==NULL)
   {
      fprintf(stderr, "malloc error.\n");
      exit(1);
   }

   // Offsets are computed in size_t: i*dim can exceed INT_MAX for large
   // data sets.
   for (i=0; i<pnum; i++)
   {
      transform(points+(size_t)i*dim, pointsdb+(size_t)i*dim);  // data transform
   }
   // `points` is not released here: how read_points() allocates it is not
   // visible in this file.

   // link all the sample points and cluster them
   for (i=0; i<pnum; i++)
   {
      mnp = (MEAN_NODE *)malloc(sizeof(MEAN_NODE));
      if (mnp==NULL)
      {
         fprintf(stderr, "malloc error.\n");
         exit(1);
      }
      mnp->ndim=dim;
      mnp->data=pointsdb+(size_t)i*dim;    // index is implied in the pointer
      mnp->radius=0;
      mnp->sibling=root;
      mnp->child=NULL;
      root=mnp;
   }
   root=clustering(root, 0);

   fp = fopen(argv[2], "wb");
   if (fp==NULL)
   {
      fprintf(stderr, "File %s open error!\n", argv[2]);
      exit(1);
   }
   if (fwrite(&dim, sizeof(int), 1, fp)!=1 ||
       fwrite(&pnum, sizeof(int), 1, fp)!=1)
   {
      fprintf(stderr, "File %s write error!\n", argv[2]);
      exit(1);
   }
   reorganization(root,fp);
   // fclose() flushes buffered data, so a failure here means a truncated file
   if (fclose(fp)!=0)
   {
      fprintf(stderr, "File %s write error!\n", argv[2]);
      exit(1);
   }

   ftime(&tp);
   system("date");

   elapse_time = (double) tp.time*1000 + (double) tp.millitm -
                 (double) ts.time*1000 - (double) ts.millitm;
   printf("Preprocessing time: %f ms\n", elapse_time);

   return 0;
}
