wolf's improvements #2

sgminer-dev · Mar 25, 2015 · fe62dc7 · fe62dc7
1 parent ae87ca6
commit fe62dc7
Show file tree

Hide file tree

Showing 7 changed files with 122 additions and 100 deletions.
diff --git a/algorithm.c b/algorithm.c
@@ -655,10 +655,14 @@ static cl_int queue_whirlpoolx_kernel(struct __clState *clState, struct _dev_blk
     tmp[0] = 0;
     whirlpool_round(midblock, tmp);
 
-    for (int x = 0; x < 8; ++x) midblock[x] ^= key[x];
+    for (int x = 0; x < 8; ++x) {
+      midblock[x] ^= key[x];
+    }
   }
 
-  for (int i = 0; i < 8; ++i) midblock[i] ^= ((uint64_t *)(clState->cldata))[i];
+  for (int i = 0; i < 8; ++i) {
+    midblock[i] ^= ((uint64_t *)(clState->cldata))[i];
+  }
 
   status = clSetKernelArg(clState->kernel, 0, sizeof(cl_ulong8), (cl_ulong8 *)&midblock);
   status |= clSetKernelArg(clState->kernel, 1, sizeof(cl_ulong), (void *)(((uint64_t *)clState->cldata) + 8));
@@ -732,27 +736,6 @@ static cl_int queue_pluck_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_un
   return status;
 }
 
-typedef struct _algorithm_settings_t {
-  const char *name; /* Human-readable identifier */
-  algorithm_type_t type; //common algorithm type
-  const char *kernelfile; /* alternate kernel file */
-  double   diff_multiplier1;
-  double   diff_multiplier2;
-  double   share_diff_multiplier;
-  uint32_t xintensity_shift;
-  uint32_t intensity_shift;
-  uint32_t found_idx;
-  unsigned long long   diff_numerator;
-  uint32_t diff1targ;
-  size_t n_extra_kernels;
-  long rw_buffer_size;
-  cl_command_queue_properties cq_properties;
-  void(*regenhash)(struct work *);
-  cl_int(*queue_kernel)(struct __clState *, struct _dev_blk_ctx *, cl_uint);
-  void(*gen_hash)(const unsigned char *, unsigned int, unsigned char *);
-  void(*set_compile_options)(build_kernel_data *, struct cgpu_info *, algorithm_t *);
-} algorithm_settings_t;
-
 static algorithm_settings_t algos[] = {
   // kernels starting from this will have difficulty calculated by using litecoin algorithm
 #define A_SCRYPT(a) \
@@ -895,7 +878,6 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa
   ALGO_ALIAS("nist5", "talkcoin-mod");
   ALGO_ALIAS("keccak", "maxcoin");
   ALGO_ALIAS("whirlpool", "whirlcoin");
-  ALGO_ALIAS("whirlpoolx", "whirlpoolx");
   ALGO_ALIAS("Lyra2RE", "lyra2re");
   ALGO_ALIAS("lyra2", "lyra2re");
 
@@ -957,8 +939,7 @@ void set_algorithm_nfactor(algorithm_t* algo, const uint8_t nfactor)
   }
 }
 
-bool cmp_algorithm(algorithm_t* algo1, algorithm_t* algo2)
+bool cmp_algorithm(const algorithm_t* algo1, const algorithm_t* algo2)	/* Compares name, kernelfile and nfactor of the two algorithms; neither argument is written to. NOTE(review): assumes safe_cmp() returns 0 on a match, strcmp-style - confirm. */
 {
-  // return (strcmp(algo1->name, algo2->name) == 0) && (algo1->nfactor == algo2->nfactor);
   return (!safe_cmp(algo1->name, algo2->name) && !safe_cmp(algo1->kernelfile, algo2->kernelfile) && (algo1->nfactor == algo2->nfactor));
 }
diff --git a/algorithm.h b/algorithm.h
@@ -9,6 +9,7 @@
 
 #include <inttypes.h>
 #include <stdbool.h>
+#include "ocl/build_kernel.h"   // For the build_kernel_data type
 
 typedef enum {
   ALGO_UNK,
@@ -25,8 +26,8 @@ typedef enum {
   ALGO_NIST,
   ALGO_FRESH,
   ALGO_WHIRL,
-  ALGO_WHIRLPOOLX,
   ALGO_NEOSCRYPT,
+  ALGO_WHIRLPOOLX,
   ALGO_LYRA2RE,
   ALGO_PLUCK
 } algorithm_type_t;
@@ -67,13 +68,35 @@ typedef struct _algorithm_t {
   void(*set_compile_options)(struct _build_kernel_data *, struct cgpu_info *, struct _algorithm_t *);
 } algorithm_t;
 
+typedef struct _algorithm_settings_t	/* Per-algorithm settings record; algorithm.c keeps a static algos[] table of these. */
+{
+	const char *name;	/* Human-readable identifier */
+	algorithm_type_t type;	/* Common algorithm type (one of the ALGO_* enum values) */
+	const char *kernelfile;	/* Alternate kernel file */
+	double   diff_multiplier1;
+	double   diff_multiplier2;
+	double   share_diff_multiplier;
+	uint32_t xintensity_shift;
+	uint32_t intensity_shift;
+	uint32_t found_idx;
+	unsigned long long   diff_numerator;
+	uint32_t diff1targ;
+	size_t n_extra_kernels;
+	long rw_buffer_size;
+	cl_command_queue_properties cq_properties;	/* NOTE(review): presumably the properties later used when the device command queue is created - confirm */
+	void     (*regenhash)(struct work *);
+	cl_int   (*queue_kernel)(struct __clState *, struct _dev_blk_ctx *, cl_uint);	/* Same signature as the queue_*_kernel functions in algorithm.c (e.g. queue_whirlpoolx_kernel) */
+	void     (*gen_hash)(const unsigned char *, unsigned int, unsigned char *);
+	void     (*set_compile_options)(build_kernel_data *, struct cgpu_info *, algorithm_t *);	/* build_kernel_data comes from ocl/build_kernel.h, included at the top of this header */
+} algorithm_settings_t;
+
 /* Set default parameters based on name. */
 void set_algorithm(algorithm_t* algo, const char* name);
 
 /* Set to specific N factor. */
 void set_algorithm_nfactor(algorithm_t* algo, const uint8_t nfactor);
 
 /* Compare two algorithm parameters */
-bool cmp_algorithm(algorithm_t* algo1, algorithm_t* algo2);
+bool cmp_algorithm(const algorithm_t* algo1, const algorithm_t* algo2);
 
 #endif /* ALGORITHM_H */
diff --git a/ocl.c b/ocl.c
@@ -146,16 +146,6 @@ static cl_int create_opencl_context(cl_context *context, cl_platform_id *platfor
   return status;
 }
 
-static cl_int create_opencl_command_queue(cl_command_queue *command_queue, cl_context *context, cl_device_id *device, cl_command_queue_properties cq_properties)
-{
-  cl_int status;
-  *command_queue = clCreateCommandQueue(*context, *device,
-    cq_properties, &status);
-  if (status != CL_SUCCESS) /* Try again without OOE enable */
-    *command_queue = clCreateCommandQueue(*context, *device, 0, &status);
-  return status;
-}
-
 static float get_opencl_version(cl_device_id device)
 {
   /* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */
@@ -193,27 +183,56 @@ static bool get_opencl_bit_align_support(cl_device_id *device)
   return !!find;
 }
 
-_clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *algorithm)
+static cl_int create_opencl_command_queue(cl_command_queue *command_queue, cl_context *context, cl_device_id *device, const void *cq_properties)
 {
-  _clState *clState = (_clState *)calloc(1, sizeof(_clState));
-  struct cgpu_info *cgpu = &gpus[gpu];
-  cl_platform_id platform = NULL;
-  char pbuff[256];
-  build_kernel_data *build_data = (build_kernel_data *)alloca(sizeof(struct _build_kernel_data));
-  cl_uint preferred_vwidth;
-  cl_device_id *devices;
-  cl_uint numDevices;
-  cl_int status;
+	cl_int status;
+	// Creates *command_queue for *device in *context, choosing the entry point by the device's reported OpenCL version; returns the status of the last creation attempt.
+	if(get_opencl_version(*device) < 2.0)	{
+		*command_queue = clCreateCommandQueue(*context, *device, *((const cl_command_queue_properties *)cq_properties), &status);	// cq_properties is read here as a pointer to one properties bitfield
+
+		// Didn't work, try again with no properties.
+    if (status != CL_SUCCESS) {
+      *command_queue = clCreateCommandQueue(*context, *device, 0, &status);
+    }
+	}
+	else {	// NOTE(review): clCreateCommandQueueWithProperties expects a 0-terminated list of (name, value) pairs, but the same pointer is dereferenced as a single bitfield in the branch above - confirm the caller passes a properly terminated list on 2.0+ devices.
+		*command_queue = clCreateCommandQueueWithProperties(*context, *device, (const cl_queue_properties *)cq_properties, &status);
+
+		// Didn't work, same deal.
+    if (status != CL_SUCCESS) {
+      *command_queue = clCreateCommandQueueWithProperties(*context, *device, 0, &status);
+    }
+	}
+
+	return status;
+}
 
+_clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *algorithm)
+{
+  cl_int status = 0;
+	size_t compute_units = 0;
+	cl_platform_id platform = NULL;
+	struct cgpu_info *cgpu = &gpus[gpu];
+	_clState *clState = (_clState *)calloc(1, sizeof(_clState));
+	cl_uint preferred_vwidth, slot = 0, cpnd = 0, numDevices = clDevicesNum();
+	cl_device_id *devices = (cl_device_id *)alloca(numDevices * sizeof(cl_device_id));
+	build_kernel_data *build_data = (build_kernel_data *)alloca(sizeof(struct _build_kernel_data));
+	char **pbuff = (char **)alloca(sizeof(char *) * numDevices), filename[256];
+
+  // sanity check
   if (!get_opencl_platform(opt_platform_id, &platform)) {
     return NULL;
   }
 
-  numDevices = clDevicesNum();
+  if (numDevices <= 0) {
+    return NULL;
+  }
 
-  if (numDevices <= 0) return NULL;
+  if (gpu >= numDevices) {
+    applog(LOG_ERR, "Invalid GPU %i", gpu);
+    return NULL;
+  }
 
-  devices = (cl_device_id *)alloca(numDevices*sizeof(cl_device_id));
 
   /* Now, get the device list data */
 
@@ -225,34 +244,33 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
 
   applog(LOG_INFO, "List of devices:");
 
-  unsigned int i;
-  for (i = 0; i < numDevices; i++) {
-    status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
-    if (status != CL_SUCCESS) {
-      applog(LOG_ERR, "Error %d: Getting Device Info", status);
+  for (int i = 0; i < numDevices; ++i)	{
+    size_t tmpsize;
+    if (clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 0, NULL, &tmpsize) != CL_SUCCESS) {
+      applog(LOG_ERR, "Error while getting the length of the name for GPU #%d.", i);
       return NULL;
     }
 
-    applog(LOG_INFO, "\t%i\t%s", i, pbuff);
-
-    if (i == gpu) {
-      applog(LOG_INFO, "Selected %i: %s", gpu, pbuff);
-      strncpy(name, pbuff, nameSize);
+    // Per the OpenCL spec, char[] queries return a NUL-terminated string and the reported size already covers it; the extra byte is only a safety margin.
+    pbuff[i] = (char *)alloca(sizeof(char) * (tmpsize + 1));
+    if (clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(char) * tmpsize, pbuff[i], NULL) != CL_SUCCESS) {
+      applog(LOG_ERR, "Error while attempting to get device information.");
+		  return NULL;
     }
-  }
-
-  if (gpu >= numDevices) {
-    applog(LOG_ERR, "Invalid GPU %i", gpu);
-    return NULL;
-  }
 
+    applog(LOG_INFO, "\t%i\t%s", i, pbuff[i]);
+	}
+
+	applog(LOG_INFO, "Selected %d: %s", gpu, pbuff[gpu]);
+  strncpy(name, pbuff[gpu], nameSize);
+
   status = create_opencl_context(&clState->context, &platform);
   if (status != CL_SUCCESS) {
     applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status);
     return NULL;
   }
 
-  status = create_opencl_command_queue(&clState->commandQueue, &clState->context, &devices[gpu], cgpu->algorithm.cq_properties);
+  status = create_opencl_command_queue(&clState->commandQueue, &clState->context, &devices[gpu], (const void *)&(cgpu->algorithm.cq_properties));
   if (status != CL_SUCCESS) {
     applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status);
     return NULL;
@@ -274,16 +292,17 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
   }
   applog(LOG_DEBUG, "Max work group size reported %d", (int)(clState->max_work_size));
 
-  size_t compute_units = 0;
   status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), (void *)&compute_units, NULL);
   if (status != CL_SUCCESS) {
     applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_COMPUTE_UNITS", status);
     return NULL;
   }
   // AMD architechture got 64 compute shaders per compute unit.
   // Source: http://www.amd.com/us/Documents/GCN_Architecture_whitepaper.pdf
-  clState->compute_shaders = compute_units * 64;
-  applog(LOG_DEBUG, "Max shaders calculated %d", (int)(clState->compute_shaders));
+  clState->compute_shaders = compute_units << 6;
+  applog(LOG_INFO, "Maximum work size for this GPU (%d) is %d.", gpu, clState->max_work_size);
+	applog(LOG_INFO, "Your GPU (#%d) has %d compute units, and all AMD cards in the 7 series or newer (GCN cards) \
+		have 64 shaders per compute unit - this means it has %d shaders.", gpu, compute_units, clState->compute_shaders);
 
   status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL);
   if (status != CL_SUCCESS) {
@@ -297,12 +316,8 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
    * would have otherwise created. The filename is:
    * name + g + lg + lookup_gap + tc + thread_concurrency + nf + nfactor + w + work_size + l + sizeof(long) + .bin
    */
-  char filename[255];
-  char strbuf[32];
-
-  sprintf(strbuf, "%s.cl", (!empty_string(cgpu->algorithm.kernelfile) ? cgpu->algorithm.kernelfile : cgpu->algorithm.name));
-  strcpy(filename, strbuf);
 
+  sprintf(filename, "%s.cl", (!empty_string(cgpu->algorithm.kernelfile) ? cgpu->algorithm.kernelfile : cgpu->algorithm.name));
   applog(LOG_DEBUG, "Using source file %s", filename);
 
   /* For some reason 2 vectors is still better even if the card says
@@ -326,10 +341,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
 
   clState->goffset = true;
 
-  if (cgpu->work_size && cgpu->work_size <= clState->max_work_size)
-    clState->wsize = cgpu->work_size;
-  else
-    clState->wsize = 256;
+  clState->wsize = (cgpu->work_size && cgpu->work_size <= clState->max_work_size) ? cgpu->work_size : 256;
 
   if (!cgpu->opt_lg) {
     applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu);
@@ -536,38 +548,32 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
     cgpu->thread_concurrency = cgpu->opt_tc;
   }
 
-  cl_uint slot, cpnd;
-
-  slot = cpnd = 0;
-
   build_data->context = clState->context;
   build_data->device = &devices[gpu];
 
   // Build information
   strcpy(build_data->source_filename, filename);
-  strcpy(build_data->platform, name);
-  strcpy(build_data->sgminer_path, sgminer_path);
-  if (opt_kernel_path && *opt_kernel_path) {
-    build_data->kernel_path = opt_kernel_path;
-  }
-  else {
-    build_data->kernel_path = NULL;
-  }
+	strcpy(build_data->platform, name);
+	strcpy(build_data->sgminer_path, sgminer_path);
 
+  build_data->kernel_path = (*opt_kernel_path) ? opt_kernel_path : NULL;
   build_data->work_size = clState->wsize;
   build_data->has_bit_align = clState->hasBitAlign;
-
   build_data->opencl_version = get_opencl_version(devices[gpu]);
   build_data->patch_bfi = needs_bfi_patch(build_data);
 
-  strcpy(build_data->binary_filename, (!empty_string(cgpu->algorithm.kernelfile) ? cgpu->algorithm.kernelfile : cgpu->algorithm.name));
-  strcat(build_data->binary_filename, name);
-  if (clState->goffset)
+  strcpy(build_data->binary_filename, filename);
+	build_data->binary_filename[strlen(filename) - 3] = 0x00;		// And one NULL terminator, cutting off the .cl suffix.
+	strcat(build_data->binary_filename, pbuff[gpu]);
+
+  if (clState->goffset) {
     strcat(build_data->binary_filename, "g");
+  }
 
   set_base_compiler_options(build_data);
-  if (algorithm->set_compile_options)
+  if (algorithm->set_compile_options) {
     algorithm->set_compile_options(build_data, cgpu, algorithm);
+  }
 
   strcat(build_data->binary_filename, ".bin");
   applog(LOG_DEBUG, "Using binary file %s", build_data->binary_filename);
@@ -576,8 +582,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
   if (!(clState->program = load_opencl_binary_kernel(build_data))) {
     applog(LOG_NOTICE, "Building binary %s", build_data->binary_filename);
 
-    if (!(clState->program = build_opencl_kernel(build_data, filename)))
+    if (!(clState->program = build_opencl_kernel(build_data, filename))) {
       return NULL;
+    }
 
     if (save_opencl_kernel(build_data, clState->program)) {
       /* Program needs to be rebuilt, because the binary was patched */

diff --git a/ocl.h b/ocl.h
@@ -10,7 +10,7 @@
 #include <CL/cl.h>
 #endif
 
-#include "miner.h"
+#include "algorithm.h"
 
 typedef struct __clState {
   cl_context context;
@@ -21,6 +21,7 @@ typedef struct __clState {
   cl_program program;
   cl_mem outputBuffer;
   cl_mem CLbuffer0;
+  cl_mem MidstateBuf;
   cl_mem padbuffer8;
   unsigned char cldata[80];
   bool hasBitAlign;

diff --git a/ocl/binary_kernel.c b/ocl/binary_kernel.c
@@ -1,5 +1,7 @@
 #include "binary_kernel.h"
+#include "miner.h"
 #include <sys/stat.h>
+#include <stdio.h>
 
 cl_program load_opencl_binary_kernel(build_kernel_data *data)
 {

diff --git a/ocl/build_kernel.c b/ocl/build_kernel.c
@@ -1,5 +1,7 @@
+#include <stdio.h>
 #include "build_kernel.h"
 #include "patch_kernel.h"
+#include "miner.h"
 
 static char *file_contents(const char *filename, int *length)
 {