Loading... Computer Organization and Design 一书中,提到利用 cache 可以加速矩阵乘法。简单来说,就是将原本的矩阵乘法拆分为分块矩阵乘法,提高了程序的空间局部性;缓存命中率变高,在乘加操作次数总数相同的情况下运行时间就变短了。这个例子的意义在于,在通常的复杂度分析(例如 DSA 课上的复杂度分析)中,只会考虑基本算术操作的次数对程序性能的影响,而没有考虑计算机硬件组成,比如 cache ,对上层软件性能的影响。这为程序员分析软件性能提供了新的重要角度。 实验代码: ```cpp #include <cstdio> #include <iostream> #include <cstring> #include <vector> using namespace std; template <class T> class MatrixMul { const int BLK_SIZE = 32; int64_t cc_naive = 0; int64_t cc_blk = 0; void do_block(int n, int si, int sj, int sk, T* A, T* B, T* C) { for (int i = si; i < si + BLK_SIZE; ++i) { for (int j = sj; j < sj + BLK_SIZE; ++j) { T cij = C[i * n + j]; for (int k = sk; k < sk + BLK_SIZE; ++k) { cij += A[i * n + k] * B[k * n + j]; cc_blk++; } C[i * n + j] = cij; } } } public: int64_t mm_by_blks(int n, T* A, T* B, T* C) { cc_blk = 0; for (int si = 0; si < n; si += BLK_SIZE) for (int sj = 0; sj < n; sj += BLK_SIZE) for (int sk = 0; sk < n; sk += BLK_SIZE) do_block(n, si, sj, sk, A, B, C); return cc_blk; } int64_t mm_naive(int n, T* A, T* B, T* C) { cc_naive = 0; for (int i = 0; i < n; ++i) { for (int j = 0; j < n; ++j) { T cij = C[i * n + j]; for (int k = 0; k < n; ++k) { cij += A[i * n + k] * B[k * n + j]; cc_naive++; } C[i * n + j] = cij; } } return cc_naive; } }; double get_rand_num() { return (double)rand() / RAND_MAX; } template <class T> void rand_fill(int n, T* A) { for (int i = 0; i < n; ++i) for (int j = 0; j < n; ++j) A[i * n + j] = get_rand_num() * 1e6; } template <class T> void test_pref(int n) { // init matrices T* A = new T[n * n]; T* B = new T[n * n]; T* C_1 = new T[n * n]; T* C_2 = new T[n * n]; // memset(A, 0, n * n * sizeof(T)); // memset(B, 0, n * n * sizeof(T)); rand_fill(n, A); rand_fill(n, B); MatrixMul<T> mm; timespec start_1, end_1; timespec start_2, end_2; // naive way clock_gettime(CLOCK_MONOTONIC, &start_1); int64_t cc_naive = mm.mm_naive(n, A, B, C_1); clock_gettime(CLOCK_MONOTONIC, &end_1); double deltaT_1 = (end_1.tv_sec - start_1.tv_sec) * 1e3 + (end_1.tv_nsec - start_1.tv_nsec) * 1e-6; // unit: ms // by blks way clock_gettime(CLOCK_MONOTONIC, &start_2); int64_t cc_blk = mm.mm_by_blks(n, A, B, C_2); clock_gettime(CLOCK_MONOTONIC, &end_2); double deltaT_2 = (end_2.tv_sec - start_2.tv_sec) * 1e3 + (end_2.tv_nsec - start_2.tv_nsec) * 1e-6; // unit: ms // verity the consistency int cons = memcmp(C_1, C_2, n * n * sizeof(T)); if (cons != 0) { cerr << "\nVerification Error: inconsistent result!" << endl; for (int i = 0; i < n; ++i) for (int j = 0; j < n; ++j) if (C_1[i] != C_2[i]) { cerr << "C_1[" << i << "][" << j << "] = " << C_1[i * n + j] << ", C_2[" << i << "][" << j << "] = " << C_2[i * n + j] << endl; } exit(cons); } else { // cout << "Verification Succeed: consistent result!" << endl; } // print results printf("\t%5d\t\t%8f\t\t%8f\t\t%12lld\t\t%12lld\n", n, deltaT_1, deltaT_2, cc_naive, cc_blk); } int main() { srand(time(0)); printf("\tN\t\tNaive way (ms)\t\tBlock way (ms)\t\tNaive CC\t\tBLK CC\n"); vector<int> n_list = {32, 160, 480, 960}; for (auto n : n_list) test_pref<int>(n); return 0; } ``` 实验结果: | N | Naïve way (ms) | Block way (ms) | | :----: | :-------------: | :------------: | | | | | | double | O0 | | | 32 | 0.115 | 0.138 | | 160 | 15.307 | 15.755 | | 480 | 382.324 | 310.271 | | 960 | 4240.496 | 2641.854 | | | | | | double | O3 | | | 32 | 0.031 | 0.016 | | 160 | 5.172 | 1.537 | | 480 | 120.444 | 28.457 | | 960 | 1106.887 | 243.577 | | | | | | int | O0 | | | 32 | 0.088 | 0.104 | | 160 | 11.182 | 13.035 | | 480 | 333.032 | 309.854 | | 960 | 3626.819 | 2647.848 | | | | | | int | O3 | | | 32 | 0.019 | 0.008 | | 160 | 2.208 | 0.66 | | 480 | 98.404 | 12.198 | | 960 | 801.031 | 99.822 | 可以看到,在 O3 优化下,分块矩阵乘法的时间仅为普通版本的 **22%** (双精度浮点乘法)、 **12%** (32 位整型乘法)。 Last modification:November 25th, 2021 at 10:35 pm © 允许规范转载 Support 如果觉得我的文章对你有用,请随意赞赏 Appreciate the author