/*
 * Copyright (c) 2009 The Trustees of Indiana University and Indiana
 *                    University Research and Technology
 *                    Corporation.  All rights reserved.
 *
 * Author(s): Torsten Hoefler <htor@cs.indiana.edu>
 *
 */

#include "netgauge.h"
#ifdef NG_PTRN_MEMCPY
#include "hrtimer/hrtimer.h"
#include "ptrn_memcpy_cmdline.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vector>
#include <time.h>
#include <algorithm>
#include <numeric>
#include "fullresult.h"
#include "statistics.h"
#include "ng_tools.hpp"


extern "C" {

/* Externally visible accumulator. memcpy_do_benchmarks() adds every byte of
 * its cache-wipe buffer into this variable after the measurements; it only
 * exists to prevent compiler optimizations that remove the cachewiper ;) */
char NG_Wiper_res=0;


/* global option set; declared here, defined outside this file */
extern struct ng_options g_options;

/* internal function prototypes */
/* forward declaration so the pattern table below can reference the
 * benchmark routine before its definition */
static void memcpy_do_benchmarks(struct ng_module *module);

/**
 * comm. pattern description and function pointer table
 *
 * Describes the "memcpy" pattern: its name, a one-line description, a flags
 * word (0 here) and the benchmark entry point.
 *
 * Each `pattern_memcpy.member = value` inside the braces is an assignment
 * expression, not a designated initializer; the value of each expression
 * still initializes the aggregate positionally.
 * NOTE(review): this presumably relies on struct ng_comm_pattern declaring
 * name, desc, flags and do_benchmarks in exactly this order -- confirm
 * against the struct definition.
 */
static struct ng_comm_pattern pattern_memcpy = {
   pattern_memcpy.name = "memcpy",
   pattern_memcpy.desc = "measures memory copy performance",
   pattern_memcpy.flags = 0,
   pattern_memcpy.do_benchmarks = memcpy_do_benchmarks
};

/**
 * Registration hook for the memcpy pattern.
 *
 * Passes the address of the pattern_memcpy descriptor to
 * ng_register_pattern() so the pattern becomes available to the main
 * program.
 *
 * @return always 0
 */
int register_pattern_memcpy(void) {
   ng_register_pattern(&pattern_memcpy);
   return 0;
}

/**
 * Benchmark routine of the memcpy pattern.
 *
 * For every data size delivered by get_next_testparams() this runs one
 * untimed warmup copy followed by up to test_count timed memcpy() calls of
 * data_size bytes between two buffers. If the "wipe" pattern option was
 * given, a 20 MiB buffer is read and written before every copy in an
 * attempt to flush the caches. Rank 0 prints min/avg/median/max of the
 * recorded times and the throughput derived from them; the amount of
 * detail depends on g_options.verbose.
 *
 * A round is cut short when g_options.max_testtime is non-zero and the
 * accumulated wall clock time (time(), one second resolution) exceeds it.
 *
 * @param module  module whose headerlen and max_datasize bound the buffer
 *                size; it is also handed to NG_MALLOC and
 *                get_next_testparams()
 *
 * NOTE(review): the buffers obtained through NG_MALLOC are never released;
 * the matching release routine is not visible in this file.
 */
static void memcpy_do_benchmarks(struct ng_module *module) {
  /** for collecting statistics */
  struct ng_statistics statistics;

  /** currently tested data size */
  long data_size;

  /** number of times to test the current data size */
  long test_count = g_options.testcount;

  /** number of the current round of the outer loop (one per data size) */
  int test_round = 0;

  /** accumulated time of the current round / start time of one test */
  time_t test_time, cur_test_time;

  /** size of the text buffer used to format output lines */
  const int txtbuf_size = 2048;

  /* parse the pattern specific command line arguments */
  struct ptrn_memcpy_cmd_struct args_info;
  if (ptrn_memcpy_parser_string(g_options.ptrnopts, &args_info, "netgauge") != 0) {
    exit(EXIT_FAILURE);
  }

  long max_data_size = ng_min(g_options.max_datasize + module->headerlen, module->max_datasize);

  /* Allocate and touch the two buffers the copy runs between. Note that the
   * timed memcpy() below copies from buffer2 into buffer1. */
  ng_info(NG_VLEV1, "Allocating %ld bytes for source data buffer", max_data_size);
  char *buffer1;
  NG_MALLOC(module, char*, max_data_size, buffer1);

  ng_info(NG_VLEV2, "Initializing source buffer (make sure it's really allocated)");
  for (long i = 0; i < max_data_size; i++) buffer1[i] = (char)0xff;

  ng_info(NG_VLEV1, "Allocating %ld bytes for destination data buffer", max_data_size);
  char *buffer2;
  NG_MALLOC(module, char*, max_data_size, buffer2);

  ng_info(NG_VLEV2, "Initializing destination buffer (make sure it's really allocated)");
  for (long i = 0; i < max_data_size; i++) buffer2[i] = (char)0xff;

  /* buffer that is walked before each copy to push other data out of the caches */
  static const int cachewiper_size = 1024*1024*20; // 20 MiB should be enough for now
  ng_info(NG_VLEV1, "Allocating %d bytes for cache wipe buffer", cachewiper_size);
  char *cachewiper;
  NG_MALLOC(module, char*, cachewiper_size, cachewiper);
  /* give the wiper defined contents: it is incremented and summed up below,
   * and reading indeterminate bytes must be avoided */
  memset(cachewiper, 0, cachewiper_size);

  int rank = g_options.mpi_opts->worldrank;
  g_options.mpi_opts->partner = (rank+1)%2;

  /* text buffer for the header and the result lines */
  char *txtbuf = (char *)malloc(txtbuf_size * sizeof(char));
  if (txtbuf == NULL) {
    ng_error("Could not (re)allocate 2048 byte for output buffer");
    ng_exit(10);
  }
  memset(txtbuf, '\0', txtbuf_size);

  /* Header printing (rank 0 only).
   * NOTE(review): the legends still speak of "RTT/2" and list columns
   * (B..G) that the result lines below never print -- presumably inherited
   * from a ping-pong pattern; confirm before changing the output format. */
  if (rank == 0) {
    if (NG_VLEV2 & g_options.verbose) {
      // very verbose - long output
      snprintf(txtbuf, txtbuf_size - 1,
        "## Netgauge v%s - mode %s - 2 processes\n"
        "##\n"
        "## A...message size [byte]\n"
        "##\n"
        "## H...minimum RTT/2\n"
        "## I...average RTT/2\n"
        "## J...median RTT/2\n"
        "## K...maximum RTT/2\n"
        "## L...standard deviation for RTT/2 (stddev)\n"
        "## M...number of RTT/2 values, that were bigger than avg + 2 * stddev.\n"
        "##\n"
        "## N...minimum throughput [Mbit/sec]\n"
        "## O...average throughput [Mbit/sec]\n"
        "## P...median throughput [Mbit/sec]\n"
        "## Q...maximum throughput [Mbit/sec]\n"
        "##\n"
        "## A  -  B  C  D  E  (F G) - H  I  J  K  (L M)  -  N  O  P  Q\n",
        NG_VERSION,
        g_options.mode);
      printf("%s", txtbuf);
    } else if (NG_VLEV1 & g_options.verbose) {
      // verbose - short output
      snprintf(txtbuf, txtbuf_size - 1,
        "## Netgauge v%s - mode %s - 2 processes\n"
        "##\n"
        "## A...message size [byte]\n"
        "##\n"
        "## F...minimum RTT/2\n"
        "## G...average RTT/2\n"
        "## H...median RTT/2\n"
        "## I...maximum RTT/2\n"
        "##\n"
        "## J...minimum throughput [Mbit/sec]\n"
        "## K...average throughput [Mbit/sec]\n"
        "## L...median throughput [Mbit/sec]\n"
        "## M...maximum throughput [Mbit/sec]\n"
        "##\n"
        "## A  -  B  C  D  E - F  G  H  I - J  K  L  M\n",
        NG_VERSION,
        g_options.mode);
      printf("%s", txtbuf);
    }
    // not verbose - no header
  }

  /* Outer test loop: one round per data size. get_next_testparams() picks
   * the next data_size/test_count; the loop ends as soon as data_size is no
   * longer positive. */
  for (data_size = g_options.min_datasize; data_size > 0;
       get_next_testparams(&data_size, &test_count, &g_options, module)) {

    /* both buffers hold max_data_size bytes; a larger copy would overrun them */
    if (data_size > max_data_size) {
      ng_error("Requested data size exceeds the allocated buffers, stopping benchmark");
      break;
    }

    ++test_round;

    // the benchmark results of this round (one entry per measured test)
    std::vector<double> trtt;

    ng_info(NG_VLEV1, "Round %d: testing %ld times with %ld bytes:", test_round, test_count, data_size);
    // if we print dots ...
    if ( (rank==0) && (NG_VLEV1 & g_options.verbose) ) {
      printf("# ");
    }

    /* Inner test loop
     * - run the requested number of tests for the current data size
     * - but only if testtime does not exceed the max. allowed test time
     *   (only if max. test time is not zero)
     * test == -1 is the warmup run; its time is not recorded.
     */
    test_time = 0;
    for (long test = -1 /* 1 warmup test */; test < test_count; test++) {

      /* the first condition short-circuits the modulo, whose divisor would
       * be zero for test_count < NG_DOT_COUNT */
      if ( (NG_VLEV1 & g_options.verbose) && ( test_count < NG_DOT_COUNT || !(test % (int)(test_count / NG_DOT_COUNT)) )) {
        printf(".");
        fflush(stdout);
      }

      cur_test_time = time(NULL);

      HRT_TIMESTAMP_T t[2];
      unsigned long long tirtt;

      /* init statistics (TODO: what does this do?) */
      ng_statistics_test_begin(&statistics);

      if (args_info.wipe_given) {
        // wipe cache before doing the copy - just read and write the
        // whole cachewiper buffer and hope that it really wipes.
        // The sum over the buffer is stored in NG_Wiper_res after the
        // benchmark so that the compiler cannot drop these writes.
        for (int w = 0; w < cachewiper_size; w++) {
          cachewiper[w]++;
        }
      }

      /* the timed section: a single memcpy of data_size bytes */
      HRT_GET_TIMESTAMP(t[0]);

      memcpy(buffer1, buffer2, data_size);

      HRT_GET_TIMESTAMP(t[1]);
      HRT_GET_ELAPSED_TICKS(t[0], t[1], &tirtt);

      /* record the result (the warmup run is skipped).
       * NOTE(review): the elapsed time is halved although no round trip is
       * involved -- looks inherited from a ping-pong pattern; kept to leave
       * the reported numbers unchanged, confirm whether it is intended. */
      if (test >= 0) {
        trtt.push_back(HRT_GET_USEC(tirtt)/2);
      }
      test_time += time(NULL) - cur_test_time;

      /* measure test time and quit test if
       * test time exceeds max. test time
       * but not if the max. test time is zero
       */
      if ( (g_options.max_testtime > 0) &&
           (test_time > g_options.max_testtime) ) {
        ng_info(NG_VLEV2, "Round %d exceeds %d seconds (duration %ld seconds)", test_round, g_options.max_testtime, (long)test_time);
        ng_info(NG_VLEV2, "Test stopped at %ld tests", test);
        break;
      }

    } /* end inner test loop */

    if (rank == 0) {
      /* add linebreak if we made dots ... */
      if ( (NG_VLEV1 & g_options.verbose) ) {
        ng_info(NG_VLEV1, "\n");
      }

      if (trtt.empty()) {
        /* Only the warmup ran (test_count <= 0 or the time limit hit right
         * away): there is nothing to take a minimum/median/average of. */
        ng_info(NG_VLEV1, "Round %d: no measurements taken for %ld bytes", test_round, data_size);
      } else {
        /* output statistics - times in microseconds */
        double trtt_avg = std::accumulate(trtt.begin(), trtt.end(), (double)0)/(double)trtt.size();
        double trtt_min = *std::min_element(trtt.begin(), trtt.end());
        double trtt_max = *std::max_element(trtt.begin(), trtt.end());
        /* median: nth_element partially sorts trtt around the middle element */
        std::vector<double>::iterator nthrtt = trtt.begin()+trtt.size()/2;
        std::nth_element(trtt.begin(), nthrtt, trtt.end());
        double trtt_med = *nthrtt;
        double trtt_var = standard_deviation(trtt.begin(), trtt.end(), trtt_avg);
        int trtt_fail = count_range(trtt.begin(), trtt.end(), trtt_avg-trtt_var*2, trtt_avg+trtt_var*2);

        /* snprintf always terminates the buffer, so it needs no clearing.
         * Throughput: bytes per microsecond times 8 gives Mbit/s. */
        if (NG_VLEV2 & g_options.verbose) {
          // very verbose - long output
          snprintf(txtbuf, txtbuf_size - 1,
            "%ld -  %.2lf %.2lf %.2lf %.2lf (%.2lf %i) - %.2lf %.2lf %.2lf %.2lf\n",
            data_size, /* data size */

            trtt_min, /* minimum time */
            trtt_avg, /* average time */
            trtt_med, /* median time */
            trtt_max, /* maximum time */

            trtt_var, /* standard deviation */
            trtt_fail, /* result of count_range() for avg +/- 2 * stddev */

            data_size/trtt_max*8, /* minimum bandwidth */
            data_size/trtt_avg*8, /* average bandwidth */
            data_size/trtt_med*8, /* median bandwidth */
            data_size/trtt_min*8 /* maximum bandwidth */
            );
        } else if (NG_VLEV1 & g_options.verbose) {
          // verbose - short output
          snprintf(txtbuf, txtbuf_size - 1,
            "%ld - %.2lf %.2lf %.2lf %.2lf - %.2lf %.2lf %.2lf %.2lf\n",
            data_size, /* data size */

            trtt_min, /* minimum time */
            trtt_avg, /* average time */
            trtt_med, /* median time */
            trtt_max, /* maximum time */

            data_size/trtt_max*8, /* minimum bandwidth */
            data_size/trtt_avg*8, /* average bandwidth */
            data_size/trtt_med*8, /* median bandwidth */
            data_size/trtt_min*8 /* maximum bandwidth */
            );
        } else {
          // not verbose - one line per data size.
          // NOTE(review): prints the minimum time next to the median
          // bandwidth; kept as found.
          snprintf(txtbuf, txtbuf_size - 1,
            "%ld bytes \t -> %.2lf us \t == %.2lf Mbit/s\n",
            data_size, /* data size */
            trtt_min, /* minimum time */
            data_size/trtt_med*8 /* median bandwidth */
            );
        }
        printf("%s", txtbuf);
      }
    }

    ng_info(NG_VLEV1, "\n");
    fflush(stdout);

  } /* end outer test loop */

  /* Make the benchmark's memory traffic observable from outside this
   * function so the optimizer cannot discard it: fold the cache-wipe buffer
   * and one byte of the copy target into the global NG_Wiper_res. */
  for (int w = 0; w < cachewiper_size; w++) {
    NG_Wiper_res += cachewiper[w];
  }
  if (max_data_size > 0) {
    NG_Wiper_res += buffer1[0];
  }

  free(txtbuf);
}

} /* extern C */

#else
/* Stub used when NG_PTRN_MEMCPY is not defined: nothing is registered and 0
 * is returned. The definition in the enabled branch sits inside
 * extern "C", so this one is given C linkage as well; otherwise the symbol
 * would differ between the two build configurations. */
#ifdef __cplusplus
extern "C"
#endif
int register_pattern_memcpy(void) { return 0; }
#endif
