/*
 * File: gen_schedules.c
 * Auth: Jae-Yong Lee (jylee@cs.umd.edu)
 *       University of Maryland, College Park
 * Desc: Functions for generating schedules.
         Function names ending in CC handle two compact descriptors;
	 names ending in CN handle one compact and one non-compact descriptor.
 * Date: Feb. 2004
 */

#include "gen_schedules.h"
#include "error.h"
#include <stdlib.h>

/*
 * Allocate an empty interX list.
 *
 * Returns a heap-allocated list header with num == 0 and both end
 * pointers set to NULL, or NULL when the allocation fails.
 * The list is released with FreeInterXList().
 */
interXs* AllocateInterXsList()
{
  interXs* res;

  res = (interXs*)malloc(sizeof(interXs));
  if (res == NULL)
    return NULL;   /* do not write through a failed allocation */

  res->num = 0;
  res->first = NULL;
  res->last = NULL;

  return res;
}

/*
 * Allocate an interX node for an nDim-dimensional intersection.
 *
 * The four per-dimension arrays (size, stride_LZ, stride, start_gap) each
 * get nDim zero-initialised entries.  The scalar fields start_LZ, end_LZ,
 * length_LZ, numPieces and regionNum are set to -1, and the pointers b and
 * next to NULL.
 *
 * Returns NULL when nDim is negative or any allocation fails; nothing is
 * leaked in that case.  A node is released with FreeInterX().
 */
interX* AllocateInterX(int nDim)
{
  interX* res;
  size_t count;

  if (nDim < 0)
    return NULL;

  /* calloc(0, ...) may legally return NULL, so always request >= 1 slot. */
  count = (nDim > 0) ? (size_t)nDim : 1;

  res = (interX*)malloc(sizeof(interX));
  if (res == NULL)
    return NULL;

  res->nDims = nDim;
  res->size = (int*)calloc(count, sizeof(int));
  res->stride_LZ = (int*)calloc(count, sizeof(int));
  res->stride = (int*)calloc(count, sizeof(int));
  res->start_gap = (int*)calloc(count, sizeof(int));

  if (res->size == NULL || res->stride_LZ == NULL ||
      res->stride == NULL || res->start_gap == NULL) {
    /* free(NULL) is a no-op, so a partial failure unwinds cleanly. */
    free(res->size);
    free(res->stride_LZ);
    free(res->stride);
    free(res->start_gap);
    free(res);
    return NULL;
  }

  res->start_LZ = -1;
  res->end_LZ = -1;
  res->length_LZ = -1;
  res->numPieces = -1;
  res->b = NULL;
  res->next = NULL;
  res->regionNum = -1;

  return res;
}

/*
 * Release a whole interX list: every node reachable from list->first
 * (via FreeInterX) and then the list header itself.
 * A NULL list is ignored, matching FreeInterX's handling of NULL nodes.
 */
void FreeInterXList(interXs* list)
{
  interX* trace;
  interX* old_trace;

  if (list == NULL)
    return;

  trace = list->first;
  while (trace != NULL)
    {
      /* Fetch the successor before the node is destroyed. */
      old_trace = trace;
      trace = trace->next;

      FreeInterX(old_trace);
    }

  free(list);
}

/*
 * Release a single interX node together with its four per-dimension
 * arrays.  A NULL node is silently ignored.
 */
void FreeInterX(interX* node)
{
  if (node == NULL)
    return;

  free(node->start_gap);
  free(node->stride);
  free(node->stride_LZ);
  free(node->size);
  free(node);
}


/*
 * Insert item at the FRONT of list and bump the element count.
 * When the list was empty, item also becomes its last element.
 */
void Add_InterX(interXs* list, interX* item)
{
  int was_empty;

  was_empty = (list->num == 0);

  item->next = list->first;
  list->first = item;
  list->num += 1;

  if (was_empty)
    list->last = item;
}

/*
 * Generate schedules for the case with two compact data descriptors.
 *
 * tab              per-process schedule table of the other program; it is
 *                  forwarded to compute_add_scheds_CC.  When it is NULL only
 *                  the schedule of this process is generated, otherwise
 *                  schedules for both programs are computed.
 * numofblocks      number of entries in mine.
 * otherNumNodes    number of processes in the other program.
 * mine             data blocks of this process.
 * others           others[n][k] is block k of process n of the other program.
 * my_ar_Set        region set intersected with this process's blocks.
 * other_ar_Set     region set intersected with the other program's blocks.
 * numblocksothers  numblocksothers[n] is the number of blocks in others[n].
 * col_major,
 * col_major_other  layout flags forwarded to compute_add_scheds_CC.
 *
 * Returns the schedule created with CreateSched(otherNumNodes) after all
 * block pairs have been processed.  Every intermediate intersection list
 * is freed before returning.
 */
sched_s* GenerateSchedules_Two_Compacts(sched_s** tab,
					int numofblocks,
					int otherNumNodes,
					block **mine,
					block *** others,
					setOfRegion_s *my_ar_Set  ,
					setOfRegion_s *other_ar_Set,
					int* numblocksothers,
					int col_major,
					int col_major_other
					)
{
  int blk, node, oblk;
  sched_s* sched;
  interXs** my_interXs_List;     /* one intersection list per local block     */
  interXs*** other_interXs_Dic;  /* [node][block] lists of the other program  */

  sched = CreateSched(otherNumNodes);

  /*
   * Intersections (interXs) of data blocks and regions, first for this
   * process and then for every process of the other program.
   */
  my_interXs_List = (interXs**)malloc(numofblocks*sizeof(interXs*));
  other_interXs_Dic = (interXs***)malloc(otherNumNodes*sizeof(interXs**));

  for (blk = 0; blk < numofblocks; blk++)
    my_interXs_List[blk] = gen_intersection_boxes(mine[blk], my_ar_Set);

  for (node = 0; node < otherNumNodes; node++) {
    other_interXs_Dic[node] =
      (interXs**)malloc(numblocksothers[node]*sizeof(interXs*));

    for (oblk = 0; oblk < numblocksothers[node]; oblk++)
      other_interXs_Dic[node][oblk] =
	gen_intersection_boxes(others[node][oblk], other_ar_Set);
  }

  /*
   * Generate schedules from the interXs of both programs.
   * Currently a sequential search over all pairs; to be replaced by a
   * tree search.
   */
  for (blk = 0; blk < numofblocks; blk++) {
    for (node = 0; node < otherNumNodes; node++) {
      for (oblk = 0; oblk < numblocksothers[node]; oblk++) {
	compute_add_scheds_CC(my_interXs_List[blk],
			      other_interXs_Dic[node][oblk],
			      sched,
			      node,
			      mine[blk],
			      my_ar_Set,
			      tab,
			      others[node][oblk],
			      other_ar_Set,
			      col_major,
			      col_major_other);
      }
    }
  }

  /* Free memory */
  for (blk = 0; blk < numofblocks; blk++)
    FreeInterXList(my_interXs_List[blk]);

  for (node = 0; node < otherNumNodes; node++) {
    for (oblk = 0; oblk < numblocksothers[node]; oblk++)
      FreeInterXList(other_interXs_Dic[node][oblk]);

    free(other_interXs_Dic[node]);
  }

  free(my_interXs_List);
  free(other_interXs_Dic);

  return sched;
}


/*
 * Generate schedules for the case with one compact and one non-compact
 * descriptor.  Meant to be called in the non-compact program, which
 * computes the schedules of both programs while the compact program is
 * idle.  Results are stored through tab_other and tab_me.
 *
 * The linearization [0, overall_size) is divided evenly among the
 * NumNodePgme(my_pgme) processes; the last process additionally takes the
 * remainder.  Only the slice [from, to] of this process is handled here.
 *
 * NOTE(review): tab_other is passed in the slot that
 * compute_whole_schedules_CN_mixed names tab_me, and tab_me in the slot it
 * names tab_recv -- confirm that this ordering is intended.
 */
void GenerateSchedules_Mixed_On_Non_Compact(sched_s** tab_other,
					    sched_s** tab_me,
					    void* ttable,
					    int otherNumNodes,
					    block *** others,
					    setOfRegion_s *compact_ar_Set,
					    int* numblocksothers,
					    int mem_layout,
					    pgme_s* my_pgme,
					    int overall_size)
{
  int r;
  int me;
  int nodes;
  int share;       /* linearization elements per process */
  int from, to;
  interXs** Dic;

  me = MyPosMyPgme();
  nodes = NumNodePgme(my_pgme);
  share = overall_size / nodes;

  /* Calculate the responsible part of the linearization (from~to) */
  from = me * share;
  to = (me == nodes - 1) ? (overall_size - 1) : ((me + 1) * share - 1);

  /* Calculate interXs for the compact data descriptor and data blocks */
  Dic = build_Dic_intersection_boxes(others, compact_ar_Set, otherNumNodes,
				     numblocksothers, from, to);

  /* Compute whole schedules */
  compute_whole_schedules_CN_mixed(tab_other, tab_me, Dic, ttable,
				   compact_ar_Set, from, to, mem_layout);

  /* Free memory: one list per region, then the directory itself */
  for (r = 0; r < compact_ar_Set->NumRegion; r++)
    FreeInterXList(Dic[r]);

  free(Dic);
}


/*
 * Generate schedules for the case with one compact and one non-compact
 * data descriptor, where both the compact and the non-compact program
 * compute schedules for both sides.
 * Computed schedules are put in tab_other and tab_me.
 *
 * The linearization [0, overall_size) is cut into equal slices, one per
 * process of both programs combined.
 *   order == 0  this program owns the leading slices: process `me` gets
 *               slice number me.
 *   order == 1  this program's slices follow those of the other program:
 *               process `me` gets slice number me + NumNodePgme(other_pgme),
 *               and the last process also takes the remainder up to
 *               overall_size - 1.
 * Any other value of order is rejected through errormsg() and no schedule
 * is generated (previously from/to were used uninitialised).
 *
 * NOTE(review): tab_other is passed in the slot that
 * compute_whole_schedules_CN_mixed names tab_me, and tab_me in the slot it
 * names tab_recv -- confirm that this ordering is intended.
 */
void GenerateSchedules_Mixed_Both(sched_s** tab_other,
				  sched_s** tab_me,
				  void* ttable,
				  int otherNumNodes,
				  block *** others,
				  setOfRegion_s *compact_ar_Set,
				  int* numblocksothers,
				  int mem_layout,
				  pgme_s* my_pgme,
				  pgme_s* other_pgme,
				  int overall_size,
				  int order)
{
  int me;
  int from, to;
  int i;
  int nodes_me, nodes_other;
  int share;       /* linearization elements per process */
  interXs** Dic;

  if (order != 0 && order != 1) {
    errormsg("invalid parameter for order");
    return;
  }

  me = MyPosMyPgme();
  nodes_me = NumNodePgme(my_pgme);
  nodes_other = NumNodePgme(other_pgme);
  share = overall_size / (nodes_me + nodes_other);

  /* Calculate the responsible part of the linearization (from~to) */
  if (order == 0) {
    from = me * share;
    to = (me + 1) * share - 1;
  }
  else {
    from = (me + nodes_other) * share;

    if (me == nodes_me - 1)
      to = overall_size - 1;
    else
      to = (me + nodes_other + 1) * share - 1;
  }

  /* Calculate interXs for the compact data descriptor and data blocks */
  Dic = build_Dic_intersection_boxes(others, compact_ar_Set, otherNumNodes,
				     numblocksothers, from, to);

  /* Compute whole schedules */
  compute_whole_schedules_CN_mixed(tab_other, tab_me, Dic, ttable,
				   compact_ar_Set, from, to, mem_layout);

  /* Free memory */
  for (i = 0; i < compact_ar_Set->NumRegion; i++)
    FreeInterXList(Dic[i]);
  free(Dic);
}

/*
 * Generate schedules for the case with two non-compact descriptors.
 *
 * For every element n of table_me (table_me->nData entries) one
 * single-element entry is appended to each side:
 *   - to the schedule of the owning process in tab_sched_me, naming the
 *     owning process of the other program and this side's local offset;
 *   - to the schedule of the owning process in tab_sched_other, naming
 *     this side's owning process and the other side's local offset.
 * my_pgme, other_pgme and OverallSize are not used.
 */
void GenerateSchedules_Two_Non_Compacts(sched_s** tab_sched_me,
					sched_s** tab_sched_other,
					simple_ttable* table_me,
					simple_ttable* table_other,
					pgme_s* my_pgme,
					pgme_s* other_pgme,
					int OverallSize)
{
  int n;
  sched_s** slot_me;
  sched_s** slot_other;

  n = 0;
  while (n < table_me->nData) {
    slot_me = &tab_sched_me[table_me->proc_num[n]];
    *slot_me = AddSchedLast(*slot_me,
			    table_other->proc_num[n],
			    table_me->local_offset[n],
			    1);

    slot_other = &tab_sched_other[table_other->proc_num[n]];
    *slot_other = AddSchedLast(*slot_other,
			       table_me->proc_num[n],
			       table_other->local_offset[n],
			       1);
    n++;
  }
}


/*
 * Calculate the intersections of one data block of a compact descriptor
 * with every region of a region set, and return them as a list (interXs).
 *
 * For each region that overlaps the block in every dimension, and that
 * still contains at least one strided element after the overlap has been
 * snapped to the region's stride, one interX is added carrying:
 *   b, regionNum   the block and the index of the region;
 *   stride[j]      the region stride per dimension;
 *   size[j]        number of strided elements of the intersection per dim;
 *   start_LZ,
 *   end_LZ         offsets of the first and last element, built from
 *                  ar_Set->BeginGlobalOffset[region] with dimension 0
 *                  varying fastest (column-major region order);
 *   length_LZ      length of one contiguous piece (1 when stride[0] > 1);
 *   numPieces      number of such pieces;
 *   stride_LZ[j]   for j >= 1, the product of the region extents of
 *                  dimensions 0..j-1 (entry 0 is not written here).
 *
 * An empty region set yields an empty list.  The returned list is released
 * with FreeInterXList().
 *
 * Note-JYLEE: regions are treated in column-major order.  The row-major
 * variant would use the last dimension wherever dimension 0 is singled
 * out below and would walk the dimensions in the opposite direction.
 *
 * NOTE(review): the scratch arrays are sized from regions[0]->nbDim and the
 * interX from b->nDims, while the loops run to each region's own nbDim;
 * this presumably relies on all of them being equal -- confirm.
 */
interXs* gen_intersection_boxes(block* b,setOfRegion_s* ar_Set)
{
  int i,j;
  int* leftX;      /* inclusive lower corner of the overlap, per dimension */
  int* rightX;     /* inclusive upper corner of the overlap, per dimension */
  interX* current;
  interXs* res;
  int tmp;
  int No_interX;
  region_hpf_s* region;

  res = AllocateInterXsList();

  /* Nothing to intersect with; this also avoids reading regions[0]. */
  if (ar_Set->NumRegion <= 0)
    return res;

  leftX = (int*)malloc(((region_hpf_s*)(ar_Set->regions[0]))->nbDim * sizeof(int));
  rightX = (int*)malloc(((region_hpf_s*)(ar_Set->regions[0]))->nbDim * sizeof(int));

  for (i = 0; i < ar_Set->NumRegion; i++) {
    region = (region_hpf_s*)ar_Set->regions[i];
    No_interX = 0;

    /* Calculate the global boundary of the intersection. */
    for (j = 0; j < region->nbDim; j++) {
      if (b->start_global[j] > region->left[j]) {
	if (region->right[j] < b->start_global[j]) {
	  /* region ends before the block starts */
	  No_interX = 1;
	  break;   /* one disjoint dimension settles it */
	}
	leftX[j] = b->start_global[j];
      }
      else {
	if (region->left[j] > b->end_global[j]) {
	  /* region starts after the block ends */
	  No_interX = 1;
	  break;
	}
	leftX[j] = region->left[j];
      }

      if (region->right[j] <= b->end_global[j])
	rightX[j] = region->right[j];
      else
	rightX[j] = b->end_global[j];
    }

    if (No_interX)
      continue;

    /* The region's bounding box intersects the data block. */
    current = AllocateInterX(b->nDims);
    current->b = b;
    current->regionNum = i;

    /* Snap the corners onto the region's strided elements.  There may be
       gaps if the stride is not 1. */
    for (j = 0; j < region->nbDim; j++) {
      current->stride[j] = region->stride[j];

      /* move leftX up and rightX down to the nearest strided element,
	 counted from region->left[j] */
      leftX[j] += (current->stride[j] - (leftX[j]-region->left[j])%current->stride[j])%current->stride[j];
      rightX[j] -= (rightX[j]-region->left[j])%current->stride[j];

      /* No data elements in that intersection area */
      if (leftX[j] > rightX[j]) {
	No_interX = 1;
	break;
      }
    }

    if (No_interX) {
      FreeInterX(current);
      continue;
    }

    /* If the stride of dimension 0 is > 1, no elements are contiguous in
       the memory layout, so one schedule entry per data element is
       needed.  Otherwise a whole run along dimension 0 is one piece. */
    if (current->stride[0] > 1)
      current->length_LZ = 1;
    else
      current->length_LZ = rightX[0] - leftX[0] + 1;

    current->start_LZ = current->end_LZ = ar_Set->BeginGlobalOffset[i];

    tmp = 1;                 /* becomes the element count of the region */
    current->numPieces = 1;

    for (j = 0; j < region->nbDim; j++) {
      current->size[j] = (rightX[j] - leftX[j])/current->stride[j] + 1;
      current->numPieces *= current->size[j];
      tmp *= (region->right[j] - region->left[j])/current->stride[j] + 1;
    }

    /* With unit stride a dimension-0 run counts as a single piece. */
    if (current->stride[0] == 1)
      current->numPieces /= current->size[0];

    /* Linear offsets of the two corners, highest dimension first;
       tmp is the weight of dimension j in the linearization. */
    for (j = region->nbDim - 1; j >= 0; j--) {
      tmp /= (region->right[j] - region->left[j])/current->stride[j] + 1;
      current->start_LZ += (leftX[j] - region->left[j])/current->stride[j]*tmp;
      current->end_LZ += ((rightX[j] - region->left[j])/current->stride[j])*tmp;
    }

    current->next = NULL;

    /* Linearization step of one index increment in dimension j+1. */
    tmp = 1;
    for (j = 0; j < region->nbDim - 1; j++) {
      tmp *= (region->right[j] - region->left[j])/current->stride[j] + 1;
      current->stride_LZ[j+1] = tmp;
    }

    Add_InterX(res, current);
  }

  free(leftX);
  free(rightX);

  return res;
}


/*
 * Can two interXs possibly intersect?
 * Only the linearization ranges [start_LZ, end_LZ] are compared:
 * returns 1 when the two ranges overlap and 0 when they are disjoint.
 */
int Is_intersect(interX* p, interX* q)
{
  return (p->start_LZ <= q->end_LZ) && (p->end_LZ >= q->start_LZ);
}


/*
 * Compute schedules for two interX lists, one from this program (my_X)
 * and one from the other program (other_X).
 *
 * Every pair (mine, theirs) whose linearization ranges may overlap
 * according to Is_intersect() is handed to
 * comp_add_scheds_two_interXs_CC() together with the remaining arguments,
 * which are forwarded unchanged.
 */
void compute_add_scheds_CC(interXs* my_X,
			   interXs* other_X,
			   sched_s* sched,
			   int which_proc,
			   block* b,
			   setOfRegion_s* ar_Set,
			   sched_s** tab,
			   block* other_b,
			   setOfRegion_s* other_ar_Set,
			   int col_major,
			   int col_major_other)
{
  interX* mine;
  interX* theirs;

  for (mine = my_X->first; mine != NULL; mine = mine->next) {
    for (theirs = other_X->first; theirs != NULL; theirs = theirs->next) {
      if (!Is_intersect(mine, theirs))
	continue;

      comp_add_scheds_two_interXs_CC(mine, theirs, sched, which_proc, b,
				     ar_Set, tab, other_b, other_ar_Set,
				     col_major, col_major_other);
    }
  }
}


/*
 * Get the count-th contiguous piece of an interX.
 *
 * 'count' is the rank of the piece inside the interX.  It is decomposed
 * into per-dimension coordinates (dimension 0 varying fastest), and the
 * start of the piece is p->start_LZ plus the sum of coordinate times
 * linearization step over the dimensions.  On return *st and *end hold the
 * first and last linearization offset of the piece
 * (*end = *st + p->length_LZ - 1).
 *
 * When stride[0] == 1 a whole run along dimension 0 is one piece, so only
 * dimensions 1..nDims-1 take part in the decomposition.  Otherwise every
 * element is its own piece and dimension 0 takes part with step 1.
 *
 * The coordinates are accumulated directly, so no scratch memory is
 * allocated per call (the earlier version used an unchecked malloc here).
 *
 * Column-major region order is assumed; a row-major variant would mirror
 * the dimension indices.
 */
void get_next_piece(interX* p,int* st, int* end,int count)
{
  int d;
  int lowest;   /* lowest dimension that takes part in piece numbering */
  int span;     /* number of pieces spanned by the dimensions still to go */
  int pos;

  lowest = (p->stride[0] != 1) ? 0 : 1;

  span = 1;
  for (d = lowest; d < p->nDims; d++)
    span *= p->size[d];

  pos = p->start_LZ;

  /* Peel off the coordinates from the slowest dimension downwards. */
  for (d = p->nDims - 1; d > lowest; d--) {
    span /= p->size[d];
    pos += p->stride_LZ[d] * (count / span);
    count %= span;
  }

  /* Coordinate of the lowest participating dimension. */
  if (lowest == 0)
    pos += count % span;                      /* dimension 0: step 1 */
  else if (p->nDims > 1)
    pos += p->stride_LZ[1] * (count % span);  /* dimension 1 */
  /* else: one-dimensional with unit stride -- a single piece at start_LZ */

  *st = pos;
  *end = pos + p->length_LZ - 1;
}


/*
 * Calculate schedules for two interXs: p from this program and q from the
 * other one.
 *
 * Both interXs are walked piece by piece in linearization order.  For each
 * overlap [lo, hi] between the current piece of p and the current piece of
 * q, schedule entries are added to scheds (for b / ar_Set) and, when
 * tab_scheds_other is not NULL, to tab_scheds_other[which_proc] (for
 * other_b / other_ar_Set, with MyPosMyPgme() as the peer).
 *
 * If one of the two arrays is not in column-major order
 * (col_major != 1 or col_major_other != 1), entries are generated through
 * put_scheds_CC_one_by_one(); otherwise through put_scheds_CC().
 *
 * The walk stops as soon as either interX runs out of pieces.
 */
void comp_add_scheds_two_interXs_CC(interX* p,
				    interX* q,
				    sched_s* scheds,
				    int which_proc,
				    block* b,
				    setOfRegion_s* ar_Set,
				    sched_s** tab_scheds_other,
				    block* other_b,
				    setOfRegion_s* other_ar_Set,
				    int col_major,
				    int col_major_other)
{
  int st1, end1;         /* current piece of p (its unconsumed part) */
  int st2, end2;         /* current piece of q (its unconsumed part) */
  int count1, count2;    /* rank of the current piece in p and in q  */
  int lo, hi;            /* overlap of the two current pieces         */
  int q_ends_first;
  int one_by_one_schedule;

  one_by_one_schedule = !(col_major == 1 && col_major_other == 1);

  count1 = 0;
  count2 = 0;

  /* Get the first chunk of each side */
  st1 = p->start_LZ;
  end1 = st1 + p->length_LZ - 1;

  st2 = q->start_LZ;
  end2 = st2 + q->length_LZ - 1;

  while (count1 < p->numPieces && count2 < q->numPieces) {

    /* q's piece lies entirely before p's piece: skip it. */
    if (st1 > st2 && end2 < st1) {
      count2++;
      get_next_piece(q, &st2, &end2, count2);
      continue;
    }

    /* p's piece lies entirely before q's piece: skip it. */
    if (st1 <= st2 && st2 > end1) {
      count1++;
      get_next_piece(p, &st1, &end1, count1);
      continue;
    }

    /* The pieces overlap on [lo, hi]. */
    lo = (st1 > st2) ? st1 : st2;
    q_ends_first = (end2 <= end1);
    hi = q_ends_first ? end2 : end1;

    if (one_by_one_schedule) {
      put_scheds_CC_one_by_one(lo, hi, scheds, which_proc, b, ar_Set, col_major);
      if (tab_scheds_other)
	put_scheds_CC_one_by_one(lo, hi, tab_scheds_other[which_proc],
				 MyPosMyPgme(), other_b, other_ar_Set,
				 col_major_other);
    }
    else {
      /* Compute schedules for my program */
      put_scheds_CC(lo, hi, scheds, which_proc, b, ar_Set);
      /* Compute schedules for other program */
      if (tab_scheds_other)
	put_scheds_CC(lo, hi, tab_scheds_other[which_proc],
		      MyPosMyPgme(), other_b, other_ar_Set);
    }

    if (q_ends_first) {
      /* q's piece is used up; p keeps whatever lies beyond it. */
      count2++;
      st1 = end2 + 1;
      get_next_piece(q, &st2, &end2, count2);

      if (st1 > end1) {
	count1++;
	get_next_piece(p, &st1, &end1, count1);
      }
    }
    else {
      /* p's piece is used up; q keeps whatever lies beyond it. */
      count1++;
      st2 = end1 + 1;
      get_next_piece(p, &st1, &end1, count1);

      if (st2 > end2) {
	count2++;
	get_next_piece(q, &st2, &end2, count2);
      }
    }
  }
}


/*
 * Add a schedule entry for a contiguous intersection chunk [st, end] of
 * the linearization.  Since the chunk is also treated as contiguous in the
 * memory layout, only one schedule entry is generated.
 *
 * 1. Find the region containing st: the last region whose
 *    BeginGlobalOffset is <= st (the final region when no later one
 *    starts beyond st).
 * 2. Decode the offset inside that region into a global address,
 *    dimension 0 varying fastest (column-major region order).
 * 3. Translate the global address into a local offset inside block b,
 *    again column-major.
 * 4. Append one entry (which_proc, local offset, end - st + 1) with
 *    AddSchedLast().
 *
 * Steps 2 and 3 are fused into one pass over the dimensions, so no
 * scratch arrays are allocated (the earlier version used two unchecked
 * mallocs per call).
 *
 * NOTE(review): when st is smaller than BeginGlobalOffset[0] the lookup
 * falls back to the LAST region, because -1 doubles as the "not found"
 * marker -- presumably callers never pass such an st; confirm.
 * NOTE(review): scheds is received by value and the pointer returned by
 * AddSchedLast() cannot reach the caller; confirm that AddSchedLast never
 * needs to replace the list head.
 */
void  put_scheds_CC(int st,int end,sched_s* scheds,int which_proc,block* b,setOfRegion_s* ar_Set)
{
  int i;
  int dim;
  int size;
  int index;
  int region_size;   /* elements spanned by the dimensions below i */
  int extent;        /* strided elements of the region in dimension i */
  int idx;           /* region-relative index in dimension i */
  int global;        /* global coordinate in dimension i */
  int tmp;           /* weight of dimension i inside block b */
  int local_st;
  region_hpf_s* region;

  size = end - st + 1;
  dim = b->nDims;

  /* Find which region includes this chunk */
  index = -1;
  for (i = 0; i < ar_Set->NumRegion; i++) {
    if (ar_Set->BeginGlobalOffset[i] > st) {
      index = i - 1;
      break;
    }
  }
  if (index == -1)
    index = ar_Set->NumRegion - 1;

  /* Get the starting index in the region. */
  st = st - ar_Set->BeginGlobalOffset[index];

  region = (region_hpf_s*)(ar_Set->regions[index]);
  region_size = region->size;

  /* Weight of the highest dimension in the local block (column major). */
  tmp = 1;
  for (i = 0; i < dim - 1; i++)
    tmp *= b->end_global[i] - b->start_global[i] + 1;

  local_st = 0;

  /* Highest dimension first: decode the global coordinate and add its
     contribution to the local offset straight away. */
  for (i = dim - 1; i > 0; i--) {
    extent = (region->right[i] - region->left[i])/region->stride[i] + 1;
    region_size /= extent;

    idx = st / region_size;
    st -= idx * region_size;
    global = region->left[i] + idx*region->stride[i];

    local_st += tmp * (global - b->start_global[i]);
    tmp /= b->size[i-1];
  }

  /* Dimension 0: the remaining offset counts strided elements directly. */
  global = region->left[0] + st*region->stride[0];
  local_st += global - b->start_global[0];

  /* Put one schedule entry for the contiguous intersection piece into the
     schedule table. */
  (void)AddSchedLast(scheds, which_proc, local_st, size);
}


/*
 * Add schedule entries for a chunk [st, end] of the linearization when the
 * memory layout of block b may differ from the region order (regions are
 * defined in column-major order, the data layout may be row major).
 *
 * The region containing st and the global address of st are found as in
 * put_scheds_CC(); the translation to a local offset depends on col_major:
 *   col_major == 1  column-major block: one entry of length end - st + 1;
 *   col_major == 0  row-major block: one single-element entry per data
 *                   element, consecutive elements being `gap` apart, where
 *                   gap is the product of b->size[1..dim-1].
 * Any other value is reported through errormsg() and nothing is added.
 * (Previously execution continued after that report and read diff[dim]
 * and an uninitialised offset.)
 *
 * Note-JYLEE: if the schedule data structure had one more field for the
 * stride, a single entry would be enough.  However, it may degrade the
 * performance of the normal (column-major) case due to larger schedules.
 *
 * NOTE(review): scheds is received by value; the head returned by
 * AddSchedLast() is threaded through the loop but cannot reach the caller.
 */
void  put_scheds_CC_one_by_one(int st,int end,sched_s* scheds,int which_proc,block* b,setOfRegion_s* ar_Set, int col_major)
{
  int* global_addr;
  int* diff;

  int tmp;
  int i;
  int dim;
  int local_st;
  int index;
  int size;
  int idx;
  int extent;
  region_hpf_s* region;
  int region_size;
  int gap;

  /* Reject a bad layout flag before any work is done. */
  if (col_major != 0 && col_major != 1) {
    errormsg("invalid parameter for column major");
    return;
  }

  size = end - st + 1;
  dim = b->nDims;

  global_addr = (int*)malloc(dim * sizeof(int));
  diff = (int*)malloc(dim * sizeof(int));
  if (global_addr == NULL || diff == NULL) {
    free(global_addr);
    free(diff);
    errormsg("memory allocation failed");
    return;
  }

  /* Find which region includes this chunk */
  index = -1;
  for (i = 0; i < ar_Set->NumRegion; i++) {
    if (ar_Set->BeginGlobalOffset[i] > st) {
      index = i - 1;
      break;
    }
  }
  if (index == -1)
    index = ar_Set->NumRegion - 1;

  st = st - ar_Set->BeginGlobalOffset[index];

  region = (region_hpf_s*)(ar_Set->regions[index]);
  region_size = region->size;

  /* Global address of the start position (column-major region order). */
  for (i = dim - 1; i > 0; i--) {
    extent = (region->right[i] - region->left[i])/region->stride[i] + 1;
    region_size /= extent;

    idx = st / region_size;
    st -= idx * region_size;
    global_addr[i] = region->left[i] + idx*region->stride[i];
  }
  global_addr[0] = region->left[0] + st*region->stride[0];

  /* Translation from global to local differs with the memory layout,
     because the layout need not match the element order of the regions. */
  for (i = 0; i < dim; i++)
    diff[i] = global_addr[i] - b->start_global[i];

  tmp = 1;
  local_st = 0;

  if (col_major == 0) {   /* row major: last dimension varies fastest */
    for (i = 1; i < dim; i++)
      tmp *= b->end_global[i] - b->start_global[i] + 1;

    for (i = 0; i < dim - 1; i++) {
      local_st += tmp * diff[i];
      tmp /= b->size[i+1];
    }
    local_st += diff[dim-1];

    /* Distance in memory between neighbours along dimension 0. */
    gap = 1;
    for (i = 1; i < dim; i++)
      gap *= b->size[i];

    for (i = 0; i < size; i++)
      scheds = AddSchedLast(scheds, which_proc, local_st + i*gap, 1);
  }
  else {                  /* column major: dimension 0 varies fastest */
    for (i = 0; i < dim - 1; i++)
      tmp *= b->end_global[i] - b->start_global[i] + 1;

    for (i = dim - 1; i > 0; i--) {
      local_st += tmp * diff[i];
      tmp /= b->size[i-1];
    }
    local_st += diff[0];

    scheds = AddSchedLast(scheds, which_proc, local_st, size);
  }

  free(diff);
  free(global_addr);
}


/*
 * Add schedules for a contiguous intersection chunk [st, end] of the
 * linearization, for one compact and one non-compact data descriptor.
 * Since data elements may not be contiguous in memory on the non-compact
 * side, one schedule entry is added per data element on both sides.
 *
 * For element k of the chunk, with t = st + k - from as the index into
 * ttable:
 *   - tab_me[ttable->proc_num[t]] receives (b->proc,
 *     ttable->local_offset[t], 1);
 *   - tab_other[b->proc] receives (ttable->proc_num[t], local offset of
 *     the element inside block b, 1).
 *
 * The local offset inside b is derived from the global address of st in
 * its region (column-major region order) and the layout of b:
 *   col_major == 1  column major, consecutive elements are 1 apart;
 *   col_major == 0  row major, consecutive elements are `gap` apart,
 *                   gap being the product of b->size[1..dim-1].
 * Any other value is reported through errormsg() and nothing is added.
 * (Previously execution continued after that report and read diff[dim]
 * and an uninitialised offset.)
 */
void  put_scheds_CN(int st,
		    int end,
		    sched_s** tab_me,
		    sched_s** tab_other,
		    block* b,
		    setOfRegion_s* ar_Set,
		    simple_ttable* ttable,
		    int from,
		    int col_major)
{
  int* global_addr;
  int* diff;
  int tmp;
  int i;
  int dim;
  int local_st;
  int index;
  int size;
  int save_st;
  int region_size;
  int extent;
  int idx;
  int step;      /* distance in b between consecutive chunk elements */
  int slot;      /* index into ttable for the current element */
  region_hpf_s* region;

  /* Reject a bad layout flag before any work is done. */
  if (col_major != 0 && col_major != 1) {
    errormsg("invalid parameter for column major");
    return;
  }

  save_st = st;
  size = end - st + 1;
  dim = b->nDims;

  global_addr = (int*)malloc(dim*sizeof(int));
  diff = (int*)malloc(dim*sizeof(int));
  if (global_addr == NULL || diff == NULL) {
    free(global_addr);
    free(diff);
    errormsg("memory allocation failed");
    return;
  }

  /* Find which region includes this chunk */
  index = -1;
  for (i = 0; i < ar_Set->NumRegion; i++) {
    if (ar_Set->BeginGlobalOffset[i] > st) {
      index = i - 1;
      break;
    }
  }
  if (index == -1)
    index = ar_Set->NumRegion - 1;

  st = st - ar_Set->BeginGlobalOffset[index];

  region = (region_hpf_s*)(ar_Set->regions[index]);
  region_size = region->size;

  /* Global address of the start position (column-major region order). */
  for (i = dim - 1; i > 0; i--) {
    extent = (region->right[i] - region->left[i])/region->stride[i] + 1;
    region_size /= extent;

    idx = st / region_size;
    st -= idx * region_size;
    global_addr[i] = region->left[i] + idx*region->stride[i];
  }
  global_addr[0] = region->left[0] + st*region->stride[0];

  for (i = 0; i < dim; i++)
    diff[i] = global_addr[i] - b->start_global[i];

  tmp = 1;
  local_st = 0;

  if (col_major == 0) {   /* Row major */
    for (i = 1; i < dim; i++)
      tmp *= b->end_global[i] - b->start_global[i] + 1;

    for (i = 0; i < dim - 1; i++) {
      local_st += tmp * diff[i];
      tmp /= b->size[i+1];
    }
    local_st += diff[dim-1];

    step = 1;
    for (i = 1; i < dim; i++)
      step *= b->size[i];
  }
  else {                  /* Col major */
    for (i = 0; i < dim - 1; i++)
      tmp *= b->end_global[i] - b->start_global[i] + 1;

    for (i = dim - 1; i > 0; i--) {
      local_st += tmp * diff[i];
      tmp /= b->size[i-1];
    }
    local_st += diff[0];

    step = 1;
  }

  /* add schedules for both programs.. */
  for (i = 0; i < size; i++) {
    slot = save_st + i - from;
    AddSched(tab_me[ttable->proc_num[slot]], b->proc, ttable->local_offset[slot], 1);
    AddSched(tab_other[b->proc], ttable->proc_num[slot], local_st + i*step, 1);
  }

  free(global_addr);
  free(diff);
}


/*
 * Compute schedules with one compact and one non-compact descriptor.
 *
 * For every interX of every per-region list in Dic whose linearization
 * range [start_LZ, end_LZ] may overlap [from, to], the interX is walked
 * piece by piece (get_next_piece).  Each part of a piece that falls inside
 * [from, to] is handed to put_scheds_CN() together with tab_me, tab_recv,
 * the interX's block, compact_ar_Set, ttable, from and mem_layout.
 * The walk over an interX ends once a piece reaches or passes `to`, or
 * when the pieces are exhausted.
 *
 * Dic must hold compact_ar_Set->NumRegion lists.
 */
void compute_whole_schedules_CN_mixed(sched_s** tab_me,
				      sched_s** tab_recv,
				      interXs** Dic,
				      void* ttable,
				      setOfRegion_s* compact_ar_Set,
				      int from,
				      int to,
				      int mem_layout)
{
  int i;
  interX* p;
  int st, end;   /* current piece of p */
  int lo;        /* first offset of the piece that lies inside [from, to] */
  int count;     /* rank of the current piece */

  for (i = 0; i < compact_ar_Set->NumRegion; i++) {
    for (p = Dic[i]->first; p; p = p->next) {

      /* Range test only: the interX cannot touch [from, to] at all. */
      if (from > p->end_LZ || to < p->start_LZ)
	continue;

      count = 0;
      st = p->start_LZ;
      end = st + p->length_LZ - 1;

      while (count < p->numPieces) {

	/* The piece starts beyond `to`: nothing further can overlap. */
	if (st > from && to < st)
	  break;

	/* Unless the piece ends before `from`, it overlaps the range. */
	if (st > from || from <= end) {
	  lo = (st > from) ? st : from;

	  if (to <= end) {
	    /* The range ends inside this piece: last contribution. */
	    put_scheds_CN(lo, to, tab_me, tab_recv, p->b, compact_ar_Set, ttable, from, mem_layout);
	    break;
	  }

	  put_scheds_CN(lo, end, tab_me, tab_recv, p->b, compact_ar_Set, ttable, from, mem_layout);
	}

	count++;
	get_next_piece(p, &st, &end, count);
      }
    }
  }
}


/* Build interX directory. 
 * For each region in the linearization range between 'from' and 'to',
 * one list of interXs is generated. 
 * Similar to the gen_intersection_boxes function.
 * May eliminate code redundancy with the function later.
 */

/*
 * Build a "dictionary" of intersections: an array of ar_Set->NumRegion
 * lists, where list i holds one interX for every block in block_set that
 * overlaps region i on that region's stride lattice.
 *
 *   block_set  block_set[m][n] is the n-th block of node m.
 *   ar_Set     set of regions; regions[i] is accessed as a region_hpf_s.
 *   numNodes   number of entries in block_set / numblocks.
 *   numblocks  numblocks[m] is the number of blocks of node m.
 *   from, to   range of linearization offsets; only the regions this range
 *              falls into (found by accumulating region sizes) are scanned.
 *
 * Returns the array of lists.  Every list is allocated even when nothing
 * is scanned (from > to, or there is no region).  Allocation failures are
 * not checked, as in the allocators this function calls.
 *
 * The layout is column major: dimension 0 is the fastest varying one.
 * NOTE(review): leftX/rightX are sized from regions[0]->nbDim and each
 * interX from b->nDims; this assumes all regions and blocks share the
 * same dimensionality -- confirm against the callers.
 */
interXs** build_Dic_intersection_boxes(block*** block_set,
                                       setOfRegion_s* ar_Set,
                                       int numNodes,
                                       int* numblocks,
                                       int from,
                                       int to)
{
  int i,j;
  int m,n;
  int* leftX;
  int* rightX;
  interX* current;
  block* b;
  region_hpf_s* reg;

  interXs** Dic;

  int nRegions;
  int weight;
  int empty;

  int start,end;
  int start_done;
  int min_region,max_region;

  nRegions = ar_Set->NumRegion;

  Dic =(interXs**)malloc(sizeof(interXs*)*nRegions);

  for(i=0; i< nRegions; i++){
    Dic[i] = AllocateInterXsList();
  }

  /* Empty range, or no region to take the dimensionality from: */
  /* return the empty lists without touching regions[0].        */
  if(from > to || nRegions <= 0)
    return Dic;

  reg = (region_hpf_s*)(ar_Set->regions[0]);
  leftX = (int*)malloc(reg->nbDim * sizeof(int));
  rightX = (int*)malloc(reg->nbDim * sizeof(int));

  /*
   * Find the first and last region touched by [from, to] by subtracting
   * the region sizes one after the other.  The defaults cover the cases
   * in which the scan runs off the end of the region list:
   *   'from' beyond all regions -> min_region = nRegions (scan nothing);
   *   'to'   beyond all regions -> max_region = last region.
   */
  min_region = nRegions;
  max_region = nRegions - 1;

  start = from;
  end = to;
  start_done = 0;

  for(i=0; i< nRegions; i++)
    {
      reg = (region_hpf_s*)(ar_Set->regions[i]);

      if(!start_done)
        {
          start -= reg->size;
          if(start < 0)
            {
              min_region = i;
              start_done = 1;
            }
        }

      end -= reg->size;
      if(start_done && end < 0)
        {
          max_region = i;
          break;
        }
    }

  for(m =0; m < numNodes; m++)
    {
      for(n =0; n < numblocks[m]; n++)
        {
          b = block_set[m][n];

          for(i=min_region; (i <= max_region) && (i < nRegions); i++)
            {
              reg = (region_hpf_s*)(ar_Set->regions[i]);
              empty = 0;

              for(j=0; j < reg->nbDim; j++)
                {
                  /* Clip the block against the region in dimension j. */
                  leftX[j] = (b->start_global[j] > reg->left[j])
                    ? b->start_global[j] : reg->left[j];
                  rightX[j] = (b->end_global[j] < reg->right[j])
                    ? b->end_global[j] : reg->right[j];

                  if(leftX[j] > rightX[j])
                    {
                      empty = 1;   /* no overlap in this dimension */
                      break;
                    }

                  /* Move leftX up and rightX down to the nearest points */
                  /* of the form left[j] + k*stride[j].                  */
                  leftX[j] += (reg->stride[j]
                               - (leftX[j] - reg->left[j]) % reg->stride[j])
                    % reg->stride[j];
                  rightX[j] -= (rightX[j] - reg->left[j]) % reg->stride[j];

                  if(leftX[j] > rightX[j])
                    {
                      empty = 1;   /* overlap holds no data element */
                      break;
                    }
                }

              if(empty)
                continue;

              current = AllocateInterX(b->nDims);
              current->b = b;
              current->regionNum = i;

              current->start_LZ = current->end_LZ = ar_Set->BeginGlobalOffset[i];
              current->numPieces = 1;

              /*
               * Walk the dimensions from the fastest varying one (0) up.
               * 'weight' is the number of region elements spanned by one
               * step in dimension j, i.e. the product of the region's
               * element counts over dimensions 0..j-1.
               */
              weight = 1;

              for(j=0; j < reg->nbDim; j++)
                {
                  current->stride[j] = reg->stride[j];
                  current->size[j] = (rightX[j] - leftX[j])/current->stride[j] +1;
                  current->numPieces *= current->size[j];

                  /* stride_LZ[0] is 1; it used to be left uninitialized. */
                  current->stride_LZ[j] = weight;

                  current->start_LZ += ((leftX[j] - reg->left[j])/current->stride[j]) * weight;
                  current->end_LZ += ((rightX[j] - reg->left[j])/current->stride[j]) * weight;

                  weight *= (reg->right[j] - reg->left[j])/current->stride[j] +1;
                }

              /* If the stride of dimension 0 is >1, no two elements are  */
              /* contiguous in the column-major layout, so each element   */
              /* is its own piece.  Otherwise a whole run along dimension */
              /* 0 is one piece.                                          */
              if(current->stride[0] > 1)
                {
                  current->length_LZ = 1;
                }
              else
                {
                  current->length_LZ = rightX[0] - leftX[0] +1;
                }

              if(current->stride[0] == 1)
                {
                  current->numPieces /= current->size[0];
                }

              Add_InterX(Dic[i], current);
            }
        }
    }

  free(leftX);
  free(rightX);

  return Dic;
}


/*
 * Print the fields of an interX to stdout; prints "Empty intersection"
 * when the pointer is NULL.
 */

void Print_interX(interX* p)
{
  int k;
  int shown;

  printf("\n----- Intersection -----\n\n");

  if(p == NULL)
    {
      printf("Empty intersection\n");
    }
  else
    {
      /* Scalar fields first. */
      printf("start_LZ : %d\n", p->start_LZ);
      printf("end_LZ : %d\n", p->end_LZ);

      printf("length_LZ : %d\n", p->length_LZ);
      printf("numPieces : %d\n", p->numPieces);

      /* Only entries 0 .. nDims-2 of the per-dimension arrays are shown. */
      shown = p->nDims - 1;

      k = 0;
      while(k < shown)
        {
          printf("stride_LZ[%d] : %d\n", k, p->stride_LZ[k]);
          k++;
        }

      k = 0;
      while(k < shown)
        {
          printf("size[%d] : %d\n", k, p->size[k]);
          k++;
        }
    }

  printf("\n-------------------------\n\n");
}
