1024x1024普通矩阵乘cuda函数记录

本文最后更新于41 天前，其中的信息可能已经过时，如有错误请发送邮件到zhangweihao22@outlook.com

速度提升

使用 time.h 中的 clock() 记录运行时间；计时区间覆盖整个 main 函数（包括内存分配和随机矩阵生成），而不仅仅是矩阵乘函数本身。

经过测试，
普通矩阵乘的c语言实现时间为：
Time taken: 3.816000 seconds
源码：

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Edge length of the square matrices used by this benchmark.
#define SIZE 1024
// Number of elements in one SIZE x SIZE matrix stored as a flat array.
#define TOTAL_SIZE (SIZE * SIZE)

// Forward declarations of the helpers defined after main().
// Fills a TOTAL_SIZE-element matrix with random values.
void generateRandomMatrix(int *matrix);
// NOTE(review): printMatrix is declared here but no definition appears in this
// listing; calling it would fail at link time unless it is defined elsewhere.
void printMatrix(int *matrix, int rows, int cols);
// Computes C = A * B for SIZE x SIZE matrices stored as flat arrays.
void matrixMultiplication(int *A, int *B, int *C);

/*
 * Benchmark driver for the CPU baseline.
 *
 * Allocates three SIZE x SIZE int matrices, fills A and B with random digits,
 * computes C = A * B with the naive triple loop, and prints the elapsed time.
 * The timed region spans allocation, random generation, the multiplication
 * and the frees, exactly as in the original listing.
 *
 * Note: clock() reports processor time used by the program, not wall-clock
 * time.
 *
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails.
 */
int main(void) {
    clock_t start, end;
    double cpu_time_used;
    start = clock();

    int *A_h = (int *)malloc(TOTAL_SIZE * sizeof(int));
    int *B_h = (int *)malloc(TOTAL_SIZE * sizeof(int));
    int *C_h = (int *)malloc(TOTAL_SIZE * sizeof(int));
    if (A_h == NULL || B_h == NULL || C_h == NULL) {
        fprintf(stderr, "Failed to allocate host matrices\n");
        /* free(NULL) is a no-op, so this is safe for whichever call failed. */
        free(A_h);
        free(B_h);
        free(C_h);
        return EXIT_FAILURE;
    }

    /* Generate the random input matrices A and B. */
    generateRandomMatrix(A_h);
    generateRandomMatrix(B_h);

    /* Compute the product C = A * B. */
    matrixMultiplication(A_h, B_h, C_h);

    /*
     * Read one element of the result through a volatile object so that an
     * optimizing compiler cannot treat the whole multiplication as dead code
     * (C_h is otherwise written and freed without ever being read).
     */
    volatile int sink = C_h[TOTAL_SIZE - 1];
    (void)sink;

    /* Release host memory. */
    free(A_h);
    free(B_h);
    free(C_h);

    end = clock();
    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
    printf("Time taken: %f seconds\n", cpu_time_used);
    return 0;
}

// 生成随机矩阵
/*
 * Fill a flat matrix with pseudo-random digits.
 *
 * matrix: destination buffer; must have room for at least TOTAL_SIZE ints.
 *
 * Every element is set to rand() % 10, i.e. a value in the range 0..9.
 * No srand() call appears in this listing, so rand() uses its default seed
 * and produces the same sequence on every run.
 */
void generateRandomMatrix(int *matrix) {
    for (int i = 0; i < TOTAL_SIZE; i++) {
        matrix[i] = rand() % 10; /* random digit in 0..9 */
    }
}

// 矩阵乘法
/*
 * Naive matrix product C = A * B for SIZE x SIZE matrices.
 *
 * A, B: input matrices, stored row by row in flat arrays of SIZE * SIZE ints.
 * C:    output matrix with the same layout; every element is overwritten.
 *       C must not overlap A or B.
 *
 * Each output element is the dot product of one row of A with one column of
 * B, so the innermost loop runs SIZE times.
 */
void matrixMultiplication(int *A, int *B, int *C) {
    for (int i = 0; i < SIZE; i++) {
        for (int j = 0; j < SIZE; j++) {
            /* Accumulate in a local so the sum is stored to C only once. */
            int sum = 0;
            for (int k = 0; k < SIZE; k++) {
                sum += A[i * SIZE + k] * B[k * SIZE + j];
            }
            C[i * SIZE + j] = sum;
        }
    }
}

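cuda简单线程分配实现时间为：
Time taken: 3.096000 seconds
注意：测得该时间时，计时区间内除了 GPU kernel 之外，还包含一次 CPU 端的 matrixMultiplication 调用，以及 cudaMalloc / cudaMemcpy 的开销，因此这个数值不能代表 kernel 本身的耗时。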
源码：

// 简单矩阵乘运算

#include<cstdio>
#include<cstdlib>
#include<ctime>
#include<iostream>
#include<cuda_runtime.h>

// Matrix dimensions passed to the kernel: A is M x K, B is K x N, C is M x N.
// These also size the device allocations and host<->device copies in main().
#define M 1024
#define K 1024
#define N 1024

// Dimensions used by the host-side helpers and host allocations.
// NOTE(review): host buffers are sized with TOTAL_SIZE while device buffers are
// sized with M*K, K*N and M*N; the two agree only because every dimension is
// 1024. Keep them in sync if any value changes.
#define SIZE 1024
#define TOTAL_SIZE (SIZE * SIZE)

// NOTE(review): the code below still spells out std::cerr / std::endl
// explicitly, so this directive is not required by those uses.
using namespace std;

// Naive matrix multiplication kernel: C = A * B, one thread per output element.
//
// Parameters:
//   A  - input matrix with Mm rows and Kk columns, stored row by row.
//   B  - input matrix with Kk rows and Nn columns, stored row by row.
//   C  - output matrix with Mm rows and Nn columns, stored row by row.
//   Mm - number of rows of A and C.
//   Kk - shared inner dimension (columns of A, rows of B).
//   Nn - number of columns of B and C.
//
// Launch layout: a 2D grid of 2D blocks in which x indexes columns and y
// indexes rows. The grid must cover at least Nn threads in x and Mm threads
// in y; surplus threads are masked off by the bounds check below. The kernel
// uses no shared memory.
//
// Preconditions: A, B and C must not overlap (the pointers are declared
// __restrict__, and an in-place product would be a data race in any case).
__global__ void matrix_mult_kernel (const int * __restrict__ A, const int * __restrict__ B, int * __restrict__ C, int Mm ,int Kk, int Nn) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y; // row of C handled by this thread
    const int col = blockIdx.x * blockDim.x + threadIdx.x; // column of C handled by this thread
    if (row < Mm && col < Nn) {
        // Flattened offsets are computed in size_t so that large matrices do
        // not overflow 32-bit int arithmetic.
        const size_t aRowStart = (size_t)row * Kk;
        int value = 0;

        // The loop runs Kk times because C(row, col) is the dot product of
        // row `row` of A and column `col` of B, and both have Kk elements.
        // Threads adjacent in x read adjacent elements of B and write
        // adjacent elements of C.
        for (int i = 0; i < Kk; i++) {
            value += A[aRowStart + i] * B[(size_t)i * Nn + col];
        }

        // C is stored as a flat array with Nn elements per row.
        C[(size_t)row * Nn + col] = value;
    }
}

// 生成随机矩阵
// Fill a flat host matrix with pseudo-random digits.
//
// matrix: destination buffer; must have room for at least TOTAL_SIZE ints.
//
// Every element is set to rand() % 10, i.e. a value in the range 0..9.
// No srand() call appears in this listing, so rand() uses its default seed
// and produces the same sequence on every run.
void generateRandomMatrix(int *matrix) {
    for (int i = 0; i < TOTAL_SIZE; i++) {
        matrix[i] = rand() % 10; // random digit in 0..9
    }
}

// 矩阵乘法
// Host-side naive matrix product C = A * B for SIZE x SIZE matrices.
//
// A, B: input matrices, stored row by row in flat arrays of SIZE * SIZE ints.
// C:    output matrix with the same layout; every element is overwritten.
//       C must not overlap A or B.
//
// Each output element is the dot product of one row of A with one column of
// B, so the innermost loop runs SIZE times.
void matrixMultiplication(int *A, int *B, int *C) {
    for (int i = 0; i < SIZE; i++) {
        for (int j = 0; j < SIZE; j++) {
            // Accumulate in a local so the sum is stored to C only once.
            int sum = 0;
            for (int k = 0; k < SIZE; k++) {
                sum += A[i * SIZE + k] * B[k * SIZE + j];
            }
            C[i * SIZE + j] = sum;
        }
    }
}


// Host buffers are sized with TOTAL_SIZE while device buffers use M, K and N;
// the copies below are only valid when all of them describe the same shape.
static_assert(M == SIZE && K == SIZE && N == SIZE,
              "host helpers assume square SIZE x SIZE matrices");

// Report a failed CUDA runtime call and terminate the program.
// `what` names the call that produced `status`. The process exits with the
// same status the original listing used for a failed launch (return -1);
// outstanding allocations are reclaimed by process teardown.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(status)
                  << " (" << what << ")" << std::endl;
        exit(-1);
    }
}

// Benchmark driver for the GPU version.
//
// Timed region: host allocation, random input generation, device allocation,
// host-to-device copies, the kernel, the device-to-host copy and device frees.
// The CPU reference multiplication runs only AFTER the timer has stopped; in
// the original listing it ran inside the timed region (and its result was then
// overwritten by the device copy), which inflated the reported time.
//
// Note: clock() reports processor time used by the program, not wall-clock
// time.
//
// Returns 0 on success, 1 if the GPU result differs from the CPU reference or
// a host allocation fails. CUDA runtime failures terminate via checkCuda().
int main ()
{
    clock_t start, end;
    double cpu_time_used;
    start = clock();

    // Create the host matrices A, B and the GPU result C.
    int *A_h = (int *)malloc(TOTAL_SIZE * sizeof(int));
    int *B_h = (int *)malloc(TOTAL_SIZE * sizeof(int));
    int *C_h = (int *)malloc(TOTAL_SIZE * sizeof(int));
    if (A_h == NULL || B_h == NULL || C_h == NULL) {
        std::cerr << "Failed to allocate host matrices" << std::endl;
        free(A_h);
        free(B_h);
        free(C_h);
        return 1;
    }

    // Generate the random input matrices A and B.
    generateRandomMatrix(A_h);
    generateRandomMatrix(B_h);

    // Byte sizes of the three matrices, computed in size_t.
    const size_t bytesA = (size_t)M * K * sizeof(int);
    const size_t bytesB = (size_t)K * N * sizeof(int);
    const size_t bytesC = (size_t)M * N * sizeof(int);

    // Device-side buffers.
    int *A_d = NULL;
    int *B_d = NULL;
    int *C_d = NULL;

    // Allocate device memory.
    checkCuda(cudaMalloc(&A_d, bytesA), "cudaMalloc A_d");
    checkCuda(cudaMalloc(&B_d, bytesB), "cudaMalloc B_d");
    checkCuda(cudaMalloc(&C_d, bytesC), "cudaMalloc C_d");

    // Upload the inputs.
    checkCuda(cudaMemcpy(A_d, A_h, bytesA, cudaMemcpyHostToDevice), "cudaMemcpy A");
    checkCuda(cudaMemcpy(B_d, B_h, bytesB, cudaMemcpyHostToDevice), "cudaMemcpy B");

    // 16 x 16 threads per block; the grid is rounded up so that it covers
    // every column (x) and every row (y) of C.
    dim3 blockSize(16,16);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x, (M + blockSize.y - 1) / blockSize.y);

    matrix_mult_kernel<<<gridSize,blockSize>>>(A_d,B_d,C_d,M,K,N);

    // Launch-configuration errors surface here; errors raised while the
    // kernel executes surface at the synchronization that follows.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    // Download the result.
    checkCuda(cudaMemcpy(C_h, C_d, bytesC, cudaMemcpyDeviceToHost), "cudaMemcpy C");

    // Release device memory.
    checkCuda(cudaFree(A_d), "cudaFree A_d");
    checkCuda(cudaFree(B_d), "cudaFree B_d");
    checkCuda(cudaFree(C_d), "cudaFree C_d");

    end = clock();
    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
    printf("Time taken: %f seconds\n", cpu_time_used);

    // Verification (outside the timed region): recompute the product on the
    // CPU into a separate buffer and compare element by element. Integer
    // arithmetic is exact, so the results must match bit for bit.
    int status = 0;
    int *C_ref = (int *)malloc(TOTAL_SIZE * sizeof(int));
    if (C_ref == NULL) {
        std::cerr << "Failed to allocate reference matrix" << std::endl;
        status = 1;
    } else {
        matrixMultiplication(A_h, B_h, C_ref);
        for (int i = 0; i < TOTAL_SIZE; i++) {
            if (C_h[i] != C_ref[i]) {
                std::cerr << "Result mismatch at element " << i
                          << ": GPU " << C_h[i] << ", CPU " << C_ref[i] << std::endl;
                status = 1;
                break;
            }
        }
    }

    // Release host memory (the original listing never freed these buffers).
    free(C_ref);
    free(A_h);
    free(B_h);
    free(C_h);

    return status;
}

Post Views: 3

速度提升

发送评论 编辑评论

推荐文章

发送评论编辑评论