From 1a6f7bc8c7814421a8658b4b29f1232c5506842a Mon Sep 17 00:00:00 2001 From: Hoonlim Lee Date: Thu, 2 Jul 2026 22:51:01 +0800 Subject: [PATCH 1/3] Fixed threading to allow multiple threads to spawn --- include/cneuron/cneuron.h | 10 +++++ src/main.c | 2 +- src/network.c | 78 +++++++++++++++++++++++++++++---------- 3 files changed, 69 insertions(+), 21 deletions(-) diff --git a/include/cneuron/cneuron.h b/include/cneuron/cneuron.h index 6cafbccdf..cc20d7699 100644 --- a/include/cneuron/cneuron.h +++ b/include/cneuron/cneuron.h @@ -114,6 +114,7 @@ void hadamard_product(const float *restrict a, const float *restrict b, float *r typedef struct { size_t length; /**< Number of layers in the network. */ size_t inputs_length; /**< Number of inputs to the network. */ + size_t total_allocated_memory; /**< The total memory allocated for the whole neural network. */ size_t *layer_lengths; /**< Number of neuron in each layer. */ size_t *prev_lengths_sums; /**< Number of neuron from all previous layer. */ size_t *prev_weights_sums; /**< Number of weights from all previous layer. */ @@ -148,6 +149,15 @@ neural_network *alloc_neural_network(size_t network_length, const size_t *layers */ neural_network *get_neural_network(size_t network_length, const size_t *layers_length, size_t inputs_length, float (*activation_function)(float, bool)); +/** + * @brief Allocates a new neural network as a copy of an existing one. + * + * @param nn The source network to copy from. + * + * @return Pointer to the newly created neural network. + */ +neural_network *copy_neural_network(const neural_network *nn); + /** * @brief Computes the output of the neural network for the given inputs. 
* diff --git a/src/main.c b/src/main.c index 25e323997..50f8e3722 100644 --- a/src/main.c +++ b/src/main.c @@ -137,7 +137,7 @@ int main(int argc, char **argv) { // Parameters const float learn_rate = 1.5f; - const size_t batch_size = 30; + const size_t batch_size = 3000; const int learn_amount = 50000000; const int batch_amount = learn_amount / batch_size; const int log_amount = 1000; // Log once reached a number of batch diff --git a/src/network.c b/src/network.c index 978b2f184..2422c2204 100644 --- a/src/network.c +++ b/src/network.c @@ -21,10 +21,12 @@ neural_network *alloc_neural_network(size_t network_length, const size_t *layers size_t prev_length = (i == 0) ? inputs_length : layers_length[i - 1]; total_float += layers_length[i] * 4 + layers_length[i] * prev_length; } - neural_network *nn = calloc(1, sizeof(neural_network) + sizeof(size_t) * (network_length * 3 + 2) + sizeof(float) * total_float); + size_t total_memory = sizeof(neural_network) + sizeof(size_t) * (network_length * 3 + 2) + sizeof(float) * total_float; + neural_network *nn = calloc(1, total_memory); if (!nn) return NULL; nn->length = network_length; nn->inputs_length = inputs_length; + nn->total_allocated_memory = total_memory; nn->layer_lengths = (size_t *)(nn + 1); nn->prev_lengths_sums = nn->layer_lengths + network_length; nn->prev_weights_sums = nn->prev_lengths_sums + network_length + 1; @@ -65,6 +67,22 @@ neural_network *get_neural_network(size_t network_length, const size_t *layers_l return nn; } +neural_network *copy_neural_network(const neural_network *nn) { + neural_network *new_nn = malloc(nn->total_allocated_memory); + memcpy(new_nn, nn, nn->total_allocated_memory); + size_t network_length = nn->length; + new_nn->layer_lengths = (size_t *)(new_nn + 1); + new_nn->prev_lengths_sums = new_nn->layer_lengths + network_length; + new_nn->prev_weights_sums = new_nn->prev_lengths_sums + network_length + 1; + new_nn->delta = (float *)(new_nn->prev_weights_sums + network_length + 1); + size_t 
total_l_sum = nn->prev_lengths_sums[network_length]; + new_nn->weighted_input = new_nn->delta + total_l_sum; + new_nn->output = new_nn->weighted_input + total_l_sum; + new_nn->bias = new_nn->output + total_l_sum; + new_nn->weights = new_nn->bias + total_l_sum; + return new_nn; +} + void compute_network(const neural_network *restrict nn, const float *restrict inputs) { assert(nn && inputs); cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, nn->layer_lengths[0], 1, nn->inputs_length, 1.0f, nn->weights, nn->layer_lengths[0], inputs, nn->inputs_length, 0.0f, nn->weighted_input, nn->layer_lengths[0]); @@ -261,8 +279,8 @@ void stochastic_gd(const neural_network *nn, float learn_rate, const float *data typedef struct { neural_network *nn; const dataset *data_batch; - size_t start; - size_t end; + size_t start_index; + size_t end_index; float **weights_gradients; float **bias_gradients; int thread_index; @@ -281,7 +299,7 @@ void *thread_worker(void *arg) { bias_gradients[i] = calloc(nn->layer_lengths[i], sizeof(float)); } - for (size_t i = 0; i < args->data_batch->length; i++) { + for (size_t i = args->start_index; i < args->end_index; i++) { float *data = &args->data_batch->all_inputs[i * args->data_batch->inputs_length]; compute_network(nn, data); @@ -294,23 +312,36 @@ void *thread_worker(void *arg) { return NULL; } +#define THREAD_COUNT 4 + void mini_batch_gd(neural_network *nn, float learn_rate, const dataset *data_batch) { assert(nn && data_batch); - ThreadArgs args; - - float **weights_gradients = malloc(nn->length * sizeof(float *)); - float **bias_gradients = malloc(nn->length * sizeof(float *)); +#ifdef USE_THREADING + pthread_t threads[THREAD_COUNT]; +#endif + ThreadArgs args[THREAD_COUNT]; + size_t chunk_size = data_batch->length / THREAD_COUNT; + for (int i = 0; i < THREAD_COUNT; i++) { + args[i].nn = copy_neural_network(nn); + args[i].data_batch = data_batch; + args[i].start_index = i * chunk_size; + args[i].end_index = (i == THREAD_COUNT - 1) ? 
data_batch->length : (i + 1) * chunk_size; + args[i].weights_gradients = malloc(nn->length * sizeof(float *)); + args[i].bias_gradients = malloc(nn->length * sizeof(float *)); - args = (ThreadArgs){.nn = nn, .data_batch = data_batch, .weights_gradients = weights_gradients, .bias_gradients = bias_gradients}; +#ifdef USE_THREADING + pthread_create(&threads[i], NULL, thread_worker, &args[i]); +#endif + } + for (int t = 0; t < THREAD_COUNT; t++) { #ifdef USE_THREADING - pthread_t thread; - pthread_create(&thread, NULL, thread_worker, &args); - pthread_join(thread, NULL); + pthread_join(threads[t], NULL); #else - thread_worker(&args); + thread_worker(&args[t]); #endif + } for (size_t i = 0; i < nn->length; i++) { size_t len = nn->layer_lengths[i]; @@ -318,20 +349,27 @@ void mini_batch_gd(neural_network *nn, float learn_rate, const dataset *data_bat size_t w_sum = nn->prev_weights_sums[i]; size_t weights_size = nn->prev_weights_sums[i + 1] - nn->prev_weights_sums[i]; for (size_t j = 0; j < weights_size; j++) { - nn->weights[w_sum + j] -= weights_gradients[i][j] / data_batch->length * learn_rate; + for (int t = 0; t < THREAD_COUNT; t++) { + nn->weights[w_sum + j] -= args[t].weights_gradients[i][j] / data_batch->length * learn_rate; + } } for (size_t j = 0; j < len; j++) { - nn->bias[l_sum + j] -= (bias_gradients[i][j] / data_batch->length) * learn_rate; + for (int t = 0; t < THREAD_COUNT; t++) { + nn->bias[l_sum + j] -= args[t].bias_gradients[i][j] / data_batch->length * learn_rate; + } } } - for (size_t i = 0; i < nn->length; i++) { - free(weights_gradients[i]); - free(bias_gradients[i]); + for (int i = 0; i < THREAD_COUNT; i++) { + for (size_t j = 0; j < nn->length; j++) { + free(args[i].weights_gradients[j]); + free(args[i].bias_gradients[j]); + } + free(args[i].weights_gradients); + free(args[i].bias_gradients); + free(args[i].nn); } - free(weights_gradients); - free(bias_gradients); } bool save_network(const char *restrict filename, const neural_network *restrict nn) 
{ From 1dc68c29a553b5b7fbd4043c4c1ce98bac79fc1c Mon Sep 17 00:00:00 2001 From: Hoonlim Lee Date: Fri, 3 Jul 2026 10:11:43 +0800 Subject: [PATCH 2/3] Improved memory realignment method --- src/network.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/network.c b/src/network.c index 2422c2204..1309d0d1e 100644 --- a/src/network.c +++ b/src/network.c @@ -68,18 +68,19 @@ neural_network *get_neural_network(size_t network_length, const size_t *layers_l } neural_network *copy_neural_network(const neural_network *nn) { + if (!nn) return NULL; neural_network *new_nn = malloc(nn->total_allocated_memory); + if (!new_nn) return NULL; memcpy(new_nn, nn, nn->total_allocated_memory); - size_t network_length = nn->length; - new_nn->layer_lengths = (size_t *)(new_nn + 1); - new_nn->prev_lengths_sums = new_nn->layer_lengths + network_length; - new_nn->prev_weights_sums = new_nn->prev_lengths_sums + network_length + 1; - new_nn->delta = (float *)(new_nn->prev_weights_sums + network_length + 1); - size_t total_l_sum = nn->prev_lengths_sums[network_length]; - new_nn->weighted_input = new_nn->delta + total_l_sum; - new_nn->output = new_nn->weighted_input + total_l_sum; - new_nn->bias = new_nn->output + total_l_sum; - new_nn->weights = new_nn->bias + total_l_sum; + char *base = (char *)new_nn; /* rebase each member by its byte offset measured inside nn's own allocation; subtracting pointers into two different allocations is undefined behaviour */ + new_nn->layer_lengths = (size_t *)(base + ((const char *)nn->layer_lengths - (const char *)nn)); + new_nn->prev_lengths_sums = (size_t *)(base + ((const char *)nn->prev_lengths_sums - (const char *)nn)); + new_nn->prev_weights_sums = (size_t *)(base + ((const char *)nn->prev_weights_sums - (const char *)nn)); + new_nn->delta = (float *)(base + ((const char *)nn->delta - (const char *)nn)); + new_nn->weighted_input = (float *)(base + ((const char *)nn->weighted_input - (const char *)nn)); + new_nn->output = (float *)(base + ((const char *)nn->output - (const char *)nn)); + new_nn->bias = (float *)(base + ((const char *)nn->bias - (const char *)nn)); + new_nn->weights = (float *)(base + ((const char *)nn->weights - (const char *)nn)); return new_nn; } From f7e060b74c630cb87e3842d7bb09084c0360053f Mon Sep 17 00:00:00 2001 From: Hoonlim Lee Date: 
Fri, 3 Jul 2026 11:49:16 +0800 Subject: [PATCH 3/3] Made bias_gradients and weights_gradients flat --- include/cneuron/cneuron.h | 2 +- src/main.c | 2 +- src/network.c | 35 ++++++++++++++--------------------- 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/include/cneuron/cneuron.h b/include/cneuron/cneuron.h index cc20d7699..dd60ad8b7 100644 --- a/include/cneuron/cneuron.h +++ b/include/cneuron/cneuron.h @@ -114,7 +114,7 @@ void hadamard_product(const float *restrict a, const float *restrict b, float *r typedef struct { size_t length; /**< Number of layers in the network. */ size_t inputs_length; /**< Number of inputs to the network. */ - size_t total_allocated_memory; /**< The total memory allocated for the whole neural network. */ + size_t total_allocated_memory; /**< The total memory allocated for the whole neural network. */ size_t *layer_lengths; /**< Number of neuron in each layer. */ size_t *prev_lengths_sums; /**< Number of neuron from all previous layer. */ size_t *prev_weights_sums; /**< Number of weights from all previous layer. 
*/ diff --git a/src/main.c b/src/main.c index 50f8e3722..c1b99b652 100644 --- a/src/main.c +++ b/src/main.c @@ -136,7 +136,7 @@ int main(int argc, char **argv) { neural_network *nn = get_neural_network(network_length, layer_lengths, train_dataset->inputs_length, &sigmoid); // Parameters - const float learn_rate = 1.5f; + const float learn_rate = 10.0f; const size_t batch_size = 3000; const int learn_amount = 50000000; const int batch_amount = learn_amount / batch_size; diff --git a/src/network.c b/src/network.c index 1309d0d1e..a929f4fb6 100644 --- a/src/network.c +++ b/src/network.c @@ -278,27 +278,21 @@ void stochastic_gd(const neural_network *nn, float learn_rate, const float *data } typedef struct { + int thread_index; neural_network *nn; const dataset *data_batch; size_t start_index; size_t end_index; - float **weights_gradients; - float **bias_gradients; - int thread_index; + float *bias_gradients; + float *weights_gradients; } ThreadArgs; void *thread_worker(void *arg) { ThreadArgs *args = (ThreadArgs *)arg; neural_network *nn = args->nn; - float **weights_gradients = args->weights_gradients; - float **bias_gradients = args->bias_gradients; - - for (size_t i = 0; i < nn->length; i++) { - size_t weights_size = nn->layer_lengths[i] * ((i == 0) ? 
nn->inputs_length : nn->layer_lengths[i - 1]); - weights_gradients[i] = calloc(weights_size, sizeof(float)); - bias_gradients[i] = calloc(nn->layer_lengths[i], sizeof(float)); - } + float *weights_gradients = args->weights_gradients; + float *bias_gradients = args->bias_gradients; for (size_t i = args->start_index; i < args->end_index; i++) { float *data = &args->data_batch->all_inputs[i * args->data_batch->inputs_length]; @@ -306,14 +300,16 @@ void *thread_worker(void *arg) { for (size_t j = 0; j < nn->length; j++) { size_t layer_index = nn->length - j - 1; - layer_learn_collect_gradient(nn, weights_gradients[layer_index], bias_gradients[layer_index], layer_index, data, args->data_batch->expected_indices[i]); + size_t l_sum = nn->prev_lengths_sums[layer_index]; + size_t w_sum = nn->prev_weights_sums[layer_index]; + layer_learn_collect_gradient(nn, &weights_gradients[w_sum], &bias_gradients[l_sum], layer_index, data, args->data_batch->expected_indices[i]); } } return NULL; } -#define THREAD_COUNT 4 +#define THREAD_COUNT 4 void mini_batch_gd(neural_network *nn, float learn_rate, const dataset *data_batch) { assert(nn && data_batch); @@ -324,12 +320,13 @@ void mini_batch_gd(neural_network *nn, float learn_rate, const dataset *data_bat ThreadArgs args[THREAD_COUNT]; size_t chunk_size = data_batch->length / THREAD_COUNT; for (int i = 0; i < THREAD_COUNT; i++) { + args[i].thread_index = i; args[i].nn = copy_neural_network(nn); args[i].data_batch = data_batch; args[i].start_index = i * chunk_size; args[i].end_index = (i == THREAD_COUNT - 1) ? 
data_batch->length : (i + 1) * chunk_size; - args[i].weights_gradients = malloc(nn->length * sizeof(float *)); - args[i].bias_gradients = malloc(nn->length * sizeof(float *)); + args[i].bias_gradients = calloc(nn->prev_lengths_sums[nn->length], sizeof(float)); + args[i].weights_gradients = calloc(nn->prev_weights_sums[nn->length], sizeof(float)); #ifdef USE_THREADING pthread_create(&threads[i], NULL, thread_worker, &args[i]); @@ -351,22 +348,18 @@ void mini_batch_gd(neural_network *nn, float learn_rate, const dataset *data_bat size_t weights_size = nn->prev_weights_sums[i + 1] - nn->prev_weights_sums[i]; for (size_t j = 0; j < weights_size; j++) { for (int t = 0; t < THREAD_COUNT; t++) { - nn->weights[w_sum + j] -= args[t].weights_gradients[i][j] / data_batch->length * learn_rate; + nn->weights[w_sum + j] -= args[t].weights_gradients[w_sum + j] / data_batch->length * learn_rate; } } for (size_t j = 0; j < len; j++) { for (int t = 0; t < THREAD_COUNT; t++) { - nn->bias[l_sum + j] -= args[t].bias_gradients[i][j] / data_batch->length * learn_rate; + nn->bias[l_sum + j] -= args[t].bias_gradients[l_sum + j] / data_batch->length * learn_rate; } } } for (int i = 0; i < THREAD_COUNT; i++) { - for (size_t j = 0; j < nn->length; j++) { - free(args[i].weights_gradients[j]); - free(args[i].bias_gradients[j]); - } free(args[i].weights_gradients); free(args[i].bias_gradients); free(args[i].nn);