Multi-threaded Matrix Multiplication

in this post you’ll find the implementation of iterative matrix multiplication using posix threads.

you can find the complete code with some utility methods on github

Approach 1 (multiply1) : The computation of each element of the output matrix happens in a thread.

struct cell{int i,j;};
void *mulcell(void* param){
	//	passing a struct to overcome the limitation of passing one argument
	struct cell *cel = (struct cell*)param;
	int k, i = cel->i, j = cel->j;

	for (k = 0; k < p; ++k){
		c[i][j] += a[i][k] * b[k][j];

	return NULL;//will never execute
void multiply1(){
	// The computation of each element of the output matrix happens in a thread.
	int i,j,r,t;

	//dynamically allocate #cells threads
	pthread_t** threads = (pthread_t**) malloc(sizeof(pthread_t*)*m*n);
	struct cell** cells = (struct cell**) malloc(sizeof(struct cell*)*m*n);

	for (i = 0; i < m; ++i) {
		for (j = 0 ; j < n ; j ++){
			t = i*n + j;
			threads[t] =(pthread_t*) malloc(sizeof(pthread_t));
			cells[t] = malloc(sizeof(struct cell));
			cells[t]->i = i ; cells[t]->j = j;
			r = pthread_create(threads[t], NULL, mulcell, (void*) cells[t]);
			if (r<0)
				printf("failed to create thread at mul1 at (i,j) %d,%d\n",i,j);

	//wait for the threads to finish
	for (i = 0; i < m*n; ++i){
//		 no need to free threads[i] as its already freed by pthread_exit();
	//free allocated memory

Approach 2 (multiply2): The  computation of each row of the output matrix happens in a thread.

void *mulrow(void* row){
	int j, i = *((int*)row) ,k;

	for (j = 0 ; j < n ; j ++)
		for (k = 0 ; k < p ; k++)
			c[i][j] += a[i][k] * b[k][j];
	return NULL;// anyway this line won't be processes yet it's added to prevent eclipse from showing warnings
void multiply2(){
	//The computation of each row of the output matrix happens in a thread.
	int i,r;

	//dynamically allocate #rows threads
	pthread_t** threads = (pthread_t**) malloc(sizeof(pthread_t*)*m);
	int* rows = (int*) malloc(sizeof(int)*m);

	for (i = 0; i < m; ++i) {
		threads[i] =(pthread_t*) malloc(sizeof(pthread_t));
		rows[i] = i;
		r = pthread_create(threads[i], NULL, mulrow, (void*) &rows[i]);
		if (r<0)
			printf("failed to create thread at mul2 at i =  %d\n",i);

	//wait for the threads to finish
	for (i = 0; i < m; ++i)

	//free allocated memory

here is a benchmark method which i’ve used to calculate the average running time.

long long benchmark(int nElements, int itrNum , void(*myfn)()){
	n = m = p = nElements;
	a = (int**) malloc(sizeof(int*)*nElements);
	b = (int**) malloc(sizeof(int*)*nElements);
	c = (int**) malloc(sizeof(int*)*nElements);

	int i,j,k;
	long long averagetime =0 ,t1,t2;
	for (i = 0; i < nElements; ++i){
		a[i] = (int*) malloc(sizeof(int)*nElements);
		b[i] = (int*) malloc(sizeof(int)*nElements);
		c[i] = (int*) malloc(sizeof(int)*nElements);
	for (k = 0; k < itrNum; ++k) {
		for (i = 0; i < nElements; ++i)
			for (j = 0; j < nElements; ++j)
				a[i][j] =rand()%1001 , b[i][j] =rand()%1001;

		t1 = curTime();
//		multiply1();
		t2 = curTime();
		averagetime += (t2-t1);
	printf("benchmark using a[%d][%d] * b[%d][%d]: average on %d samples = %lld usec\n",
	return averagetime;

Leave a Reply

Fill in your details below or click an icon to log in: Logo

You are commenting using your account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s