/* Original version in Hoard Memory Allocator v2.1.2d
 *
 * This is a UNIX port of the latest version of the benchmark described
 * by Larson & Krishnan in "Memory Allocation for Long-Running Server
 * Applications", ISMM 1998.
 * 
 * To see how it scales, try the following parameters, where P = 1 and
 * then the number of processors on your system, for larson and
 * larson_hoard:
 * 
 * Multi-threaded test driver 
 * C++ version (new and delete)
 * runtime (sec): 30
 * chunk size (min,max): 8 16 
 * threads (min, max):   P P
 * chunks/thread:  10000
 * no of rounds:   10
 * random seed:    1
 */

#include <pthread.h>
#include <stdio.h>
#include <sys/time.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <stdlib.h>
#include <dmmlib/dmmlib.h>
#include <dmmlib/print_stats.h>
#include "lran2.h"

/* Number of thread_data descriptors runthreads() keeps on its stack. */
#define MAX_THREADS     100
/* Capacity of the file-scope blkp[] and blksize[] tables. */
#define MAX_BLOCKS  1000000

/*
 * FALSE (0) and TRUE (1) constants, provided unless BOOLEAN is already a
 * macro. Because BOOLEAN is defined as an empty macro just before the enum,
 * the tag disappears during preprocessing and the enum is anonymous.
 */
#ifndef BOOLEAN
#define BOOLEAN
enum BOOLEAN { FALSE, TRUE };
#endif /* BOOLEAN */

/*
 * Type aliases used by the rest of the file. NOTE(review): the names look
 * like leftovers from the pre-port version of the benchmark (the file header
 * calls this a UNIX port) -- confirm before renaming.
 */
typedef void * LPVOID;
typedef unsigned long ULONG;
typedef long long _int64;
/* Function type of a thread entry point: takes a void * and returns a void *. */
typedef void * VoidFunction (void *);

/*
 * Per-worker descriptor. runthreads() fills one in for each worker slot and
 * passes its address to exercise_heap(), which updates the counters and the
 * finished flag.
 */
typedef struct thr_data {

    int threadno;   /* 1-based slot number assigned by runthreads() */
    int NumBlocks;  /* free/malloc iterations per exercise_heap() invocation */
    long seed;      /* seed handed to lran2_init() for rgen below */

    /* Block sizes are min_size + (random % (max_size - min_size)). */
    int min_size;
    int max_size;

    char **array;   /* this worker's slice of the block-pointer table */
    long *blksize;  /* this worker's slice of the block-size table */
    int asize;      /* number of slots in array / blksize */

    int cAllocs;    /* malloc calls made by exercise_heap() */
    int cFrees;     /* free calls made by exercise_heap() */
    int cThreads;   /* exercise_heap() invocations that got past the stopflag check */
    int cBytesAlloced; /* not referenced anywhere else in this file */

    /* FALSE while exercise_heap() is in its loop, TRUE once it has left it;
     * polled by runthreads(). */
    volatile int finished;
    struct lran2_st rgen; /* random generator state private to this worker */

} thread_data;

/* Set to TRUE by runthreads() after the timed sleep; polled by exercise_heap(). */
int volatile stopflag = FALSE;
/* Block size bounds; main() overwrites these defaults before the benchmark runs. */
int min_size = 10, max_size = 500;
/* Generator used by warmup() and by runthreads() to draw per-thread seeds. */
struct lran2_st rgen;
/* Pointers to the live blocks; runthreads() gives each worker a slice of it. */
char *blkp[MAX_BLOCKS];
/* Requested sizes recorded alongside the block pointers. */
long blksize[MAX_BLOCKS];

/*
 * Report how many QueryPerformanceCounter() ticks make up one second.
 * The counter runs in microseconds, so the answer is always one million.
 *
 * x: receives the tick rate.
 */
static void QueryPerformanceFrequency(long *x) {
    const long ticks_per_second = 1000L * 1000L;

    *x = ticks_per_second;
}

/*
 * Store the current time of day, expressed in microseconds, into *x.
 * The value comes from gettimeofday(); its return code is not inspected.
 *
 * x: receives seconds * 1000000 + microseconds.
 */
static void QueryPerformanceCounter (long *x) {
    const long usec_per_sec = 1000000L;
    struct timeval now;

    gettimeofday(&now, NULL);
    *x = now.tv_sec * usec_per_sec + now.tv_usec;
}

/*
 * Pause the calling thread for x milliseconds, rounded down to whole
 * seconds; a request shorter than 1000 ms therefore does not pause at all.
 *
 * x: requested delay in milliseconds.
 */
static void Sleep(long x) {
    const long whole_seconds = x / 1000;

    sleep((unsigned int) whole_seconds);
}

/*
 * Start a new thread that runs x(z).
 *
 * The thread is created detached: no caller in this file keeps the thread
 * handle or joins it, so a joinable thread would never have its resources
 * reclaimed. exercise_heap() starts a successor thread every time it
 * finishes a pass, which makes that leak grow for the whole run.
 *
 * x: thread entry point.
 * z: argument passed to x.
 *
 * There is no return value to report failure through, and runthreads() waits
 * indefinitely for a worker that never started, so a failure to create the
 * thread is reported on stderr and terminates the process.
 */
static void _beginthread(VoidFunction x, void * z) {
    pthread_t pt;
    pthread_attr_t pa;
    int rc;

    rc = pthread_attr_init(&pa);
    if (rc == 0) {
        rc = pthread_attr_setdetachstate(&pa, PTHREAD_CREATE_DETACHED);
        if (rc == 0) {
            rc = pthread_create(&pt, &pa, x, z);
        }
        /* The attribute object is no longer needed once the thread exists. */
        pthread_attr_destroy(&pa);
    }

    if (rc != 0) {
        /* pthread functions return the error number instead of setting errno. */
        fprintf(stderr, "_beginthread: cannot start thread: %s\n", strerror(rc));
        exit(EXIT_FAILURE);
    }
}

/*
 * Populate num_chunks block slots and churn them before the timed run.
 *
 * Three phases:
 *   1. allocate a block of random size for every slot;
 *   2. shuffle the slots into a random permutation;
 *   3. 4 * num_chunks times, free a randomly chosen slot and refill it.
 *
 * Sizes are min_size plus a random offset below (max_size - min_size), drawn
 * from the file-scope generator rgen. When the two bounds are equal every
 * block is exactly min_size bytes (the modulo is skipped, since a zero
 * divisor is undefined).
 *
 * blkp:       first slot to fill (this parameter hides the file-scope blkp).
 * num_chunks: number of slots starting at blkp.
 *
 * Allocation failure is caught by assert() only.
 *
 * NOTE(review): sizes are recorded in the file-scope blksize[] starting at
 * index 0, whereas runthreads() passes a blkp that starts further into its
 * table on later rounds, so the recorded sizes do not line up with the
 * per-thread blksize slices. Nothing in this file reads blksize[]; confirm
 * whether that matters before relying on it.
 */
static void warmup(char **blkp, int num_chunks) {
    const int range = max_size - min_size;
    int cblks;
    long victim;
    long blk_size;
    long tmp_size;
    char *tmp_blk;

    for(cblks = 0; cblks < num_chunks; cblks++) {
        blk_size = min_size;
        if( range != 0 ) {
            blk_size += lran2(&rgen) % range;
        }
        blkp[cblks] = (char *) malloc((size_t) blk_size);
        blksize[cblks] = blk_size;
        assert(blkp[cblks] != NULL);
    }

    /* generate a random permutation of the chunks */
    for(cblks = num_chunks; cblks > 0 ; cblks--) {
        victim = lran2(&rgen) % cblks;

        tmp_blk = blkp[victim];
        blkp[victim]  = blkp[cblks-1];
        blkp[cblks-1] = tmp_blk;

        /* Move the recorded size with its block so the two stay paired. */
        tmp_size = blksize[victim];
        blksize[victim]  = blksize[cblks-1];
        blksize[cblks-1] = tmp_size;
    }

    for(cblks = 0; cblks < 4 * num_chunks; cblks++) {
        victim = lran2(&rgen) % num_chunks;
        free(blkp[victim]);

        blk_size = min_size;
        if( range != 0 ) {
            blk_size += lran2(&rgen) % range;
        }
        blkp[victim] = (char *) malloc((size_t) blk_size);
        blksize[victim] = blk_size;
        assert(blkp[victim] != NULL);
    }
}

/*
 * Worker thread body: repeatedly replace random blocks in the worker's slice.
 *
 * pinput: pointer to the worker's thread_data descriptor.
 *
 * Each of up to NumBlocks iterations frees a randomly chosen slot, allocates
 * a replacement of random size (min_size plus an offset below
 * max_size - min_size), records the size and touches the new block. The loop
 * ends early once stopflag is set. cFrees, cAllocs and cThreads in the
 * descriptor are updated along the way; finished is FALSE during the loop and
 * TRUE afterwards.
 *
 * If stopflag is still clear after the loop, a successor thread is started
 * on the same descriptor, so one worker slot is served by a chain of
 * threads. If stopflag is already set on entry the function returns at once
 * without touching the descriptor.
 *
 * Always returns NULL. Allocation failure is caught by assert() only.
 */
static void * exercise_heap( void *pinput) {
    thread_data  *pdea;
    int           cblks;
    long          victim;
    long          blk_size;
    int           range;

    if( stopflag ) return NULL;

    pdea = (thread_data *) pinput;
    pdea->finished = FALSE;
    pdea->cThreads++;
    range = pdea->max_size - pdea->min_size;

    /* replace NumBlocks chunks with new ones of random size */
    for(cblks = 0; cblks < pdea->NumBlocks; cblks++) {
        victim = lran2(&pdea->rgen)%pdea->asize;
        free(pdea->array[victim]);
        pdea->cFrees++;

        /* Equal bounds mean fixed-size blocks; a modulo by zero is undefined. */
        blk_size = pdea->min_size;
        if( range != 0 ) {
            blk_size += lran2(&pdea->rgen)%range;
        }
        pdea->array[victim] = (char *) malloc((size_t) blk_size);

        pdea->blksize[victim] = blk_size;
        assert(pdea->array[victim] != NULL);

        pdea->cAllocs++;

        /* Write something! Stay inside the block: only touch the bytes that
         * the requested size actually covers. */
        volatile char * chptr = pdea->array[victim];
        if( blk_size > 0 ) {
            *chptr = 'a';
            volatile char ch = *chptr;
            (void) ch;
        }
        if( blk_size > 1 ) {
            chptr[1] = 'b';
        }

        if( stopflag ) break;
    }

    pdea->finished = TRUE;

    if( !stopflag ) {
        _beginthread(exercise_heap, pdea);
    }

    return NULL;
}


/*
 * Run the benchmark once for every thread count from min_threads to
 * max_threads and print one result line per count.
 *
 * sleep_cnt:   length of each timed run, in seconds.
 * min_threads: first thread count to measure.
 * max_threads: last thread count to measure; must not exceed MAX_THREADS.
 * chperthread: number of block slots owned by each worker.
 * num_rounds:  each worker pass performs num_rounds * chperthread
 *              free/malloc pairs before handing over to a successor thread.
 *
 * For each thread count the function warms up the slots added since the
 * previous round, starts the workers, sleeps for sleep_cnt seconds, raises
 * stopflag and waits for every worker to report finished. It then prints the
 * heap statistics followed by: thread count, measured duration in seconds,
 * allocation rate relative to the first round, allocations per second, used
 * space in MiB and the used/requested space ratio.
 */
static void runthreads(long sleep_cnt, int min_threads, int max_threads, int chperthread, int num_rounds) {
    thread_data de_area[MAX_THREADS];
    long ticks_per_sec;
    int prevthreads;
    int num_threads;
    int nperthread;
    long long sum_allocs;

    int i;

    long start_cnt, end_cnt;
    _int64 ticks;
    double duration;

    double rate_1 = 0, rate_n;
    size_t reqd_space;
    size_t used_space;

    /* de_area holds MAX_THREADS descriptors; refuse to index past it. */
    if( max_threads > MAX_THREADS ) {
        printf("Max %d threads - exiting\n", MAX_THREADS);
        return;
    }

    QueryPerformanceFrequency( &ticks_per_sec );

    /* Clear every descriptor, including fields the setup loop never assigns. */
    memset(de_area, 0, sizeof(de_area));

    prevthreads = 0 ;
    for(num_threads=min_threads; num_threads <= max_threads; num_threads++) {

        /* Only the slots added since the previous round need populating. */
        warmup(&blkp[prevthreads*chperthread], (num_threads-prevthreads)*chperthread );

        nperthread = chperthread ;
        stopflag   = FALSE ;

        for(i = 0; i < num_threads; i++) {
            de_area[i].threadno    = i+1 ;
            de_area[i].NumBlocks   = num_rounds*nperthread;
            de_area[i].array       = &blkp[i*nperthread];
            de_area[i].blksize     = &blksize[i*nperthread];
            de_area[i].asize       = nperthread;
            de_area[i].min_size    = min_size;
            de_area[i].max_size    = max_size;
            de_area[i].seed        = lran2(&rgen);
            de_area[i].cAllocs     = 0;
            de_area[i].cFrees      = 0;
            de_area[i].cThreads    = 0;
            de_area[i].finished    = FALSE;
            lran2_init(&de_area[i].rgen, de_area[i].seed);
            _beginthread(exercise_heap, &de_area[i]);
        }

        QueryPerformanceCounter( &start_cnt );

        printf ("Sleeping for %ld seconds.\n", sleep_cnt);
        Sleep(sleep_cnt * 1000L) ;

        stopflag = TRUE ;

        for(i = 0; i < num_threads; i++) {
            while( !de_area[i].finished ) {
                sched_yield();
            }
        }

        QueryPerformanceCounter( &end_cnt );

        sum_allocs = 0 ;
        for(i = 0; i < num_threads; i++) {
            sum_allocs += de_area[i].cAllocs ;
            de_area[i].cAllocs = de_area[i].cFrees = 0;
        }

        /* Divide in floating point so fractions of a second are kept. */
        ticks = end_cnt - start_cnt ;
        duration = (double) ticks / (double) ticks_per_sec;

        /* A successor thread may have started just before stopflag was set
         * and cleared the flag again after the wait loop saw it set. */
        for(i = 0; i < num_threads; i++) {
            if( !de_area[i].finished ) {
                printf("Thread at %d not finished\n", i);
            }
        }

        rate_n = (double) sum_allocs / duration ;
        if( rate_1 == 0){
            rate_1 = rate_n ;
        }

        // FIXME Currently only one heap is used in the example
        used_space = get_allocated_space(&systemallocator.heaps[0]);
        reqd_space = get_used_space(&systemallocator.heaps[0]);
        printf(" Used space: %zu\n Requested space: %zu\n", used_space, reqd_space);

        printf("%2d ", num_threads ) ;
        printf("%6.3f", duration  ) ;
        printf("%6.3f", rate_n/rate_1 );
        printf("%8.0f", rate_n);
        /* %f needs double arguments; convert before dividing so the values
         * are neither truncated nor passed with the wrong type. */
        printf(" %6.3f %.3f",
               (double) used_space / (1024.0 * 1024.0),
               (double) used_space / (double) reqd_space);
        printf("\n") ;

        Sleep(5000L) ; // wait 5 sec for old threads to die

        prevthreads = num_threads;
    }

}

int main(void) {
    long sleep_cnt;
    int min_threads, max_threads;
    int num_chunks = 10000;
    int num_rounds;
    int chperthread;

    printf("Larson benchmark\n");

    printf("runtime (sec): ") ;
    //scanf ("%ld", &sleep_cnt);
    sleep_cnt = 30;
    printf("%ld\n", sleep_cnt);

    printf("chunk size (min,max): ") ;
    //scanf("%d %d", &min_size, &max_size ) ;
    min_size = 32;
    max_size = 768;
    printf("%d %d\n", min_size, max_size);

    printf("threads (min, max):   ") ; 
    //scanf("%d %d", &min_threads, &max_threads) ;
    min_threads = 1;
    max_threads = 4;
    printf("%d %d\n", min_threads, max_threads);

    pthread_setconcurrency(max_threads);

    printf("chunks/thread:  ");
    //scanf("%d", &chperthread );
    chperthread = 10000;
    printf("%d\n", chperthread);

    num_chunks = max_threads * chperthread ;
    if( num_chunks > MAX_BLOCKS ){
        printf("Max %d chunks - exiting\n", MAX_BLOCKS ) ;
        return 1;
    }

    printf("no of rounds:   ");
    //scanf("%d", &num_rounds );
    num_rounds = 10;
    printf("%d\n", num_rounds);

    runthreads(sleep_cnt, min_threads, max_threads, chperthread, num_rounds) ;

    return 0;
}