用 Vivado HLS 实现矩阵分块乘法

10月 23日, 2019 LinearAlgebra VivadoHLS


Vivado HLS

官方文档: Vivado Design Suite User Guide: High-Level Synthesis (UG902)

使用HLS需要做什么

使用HLS能得到什么

优化指令/P16/

矩阵分块乘法的例子

头文件 (MatrixMul.h):

#ifndef _BLOCK_MM_H_
#define _BLOCK_MM_H_
#include "hls_stream.h"
#include <iostream>
#include <iomanip>
#include <vector>
using namespace std;

// Scalar element type used for every matrix entry.
typedef int DTYPE;
// Edge length of the full square matrices handled by the test bench.
const int SIZE = 16;
// Edge length of one square output tile; SIZE / BLOCK_SIZE tiles span a row.
const int BLOCK_SIZE = 4;

// One stream word: BLOCK_SIZE matrix elements transferred together.
struct blockvec {
  DTYPE a[BLOCK_SIZE];
};

// One BLOCK_SIZE x BLOCK_SIZE tile of the product matrix.
struct blockmat {
  DTYPE out[BLOCK_SIZE][BLOCK_SIZE];
};

void blockmatmul(hls::stream<blockvec> &Arows, hls::stream<blockvec> &Bcols,
                                 blockmat & ABpartial, DTYPE iteration);
#endif

分块乘:

#include "MatrixMul.h"

// Computes one BLOCK_SIZE x BLOCK_SIZE tile of a matrix product from streamed
// operands and stores it in ABpartial.
//
//   Arows      stream of blockvec words; read SIZE times, but only when
//              it % (SIZE/BLOCK_SIZE) == 0.
//   Bcols      stream of blockvec words; read SIZE times on every call.
//   ABpartial  output tile, fully overwritten.
//   it         call index; selects whether a new strip of A is loaded.
void blockmatmul(hls::stream<blockvec> &Arows, hls::stream<blockvec> &Bcols,
        blockmat &ABpartial, int it) {
  // Vivado HLS directive requesting the DATAFLOW optimization for this
  // function body (see UG902 for the exact semantics and restrictions).
  #pragma HLS DATAFLOW
  // Position of this call within a group of SIZE/BLOCK_SIZE consecutive calls.
  int counter = it % (SIZE/BLOCK_SIZE);
  // static: the buffered strip of A survives between calls, so the calls with
  // counter != 0 reuse the data loaded when counter was 0.
  static DTYPE A[BLOCK_SIZE][SIZE];
  if(counter == 0){ //only load the A rows when necessary
    // Word i of the stream supplies column i of the strip: element j of the
    // word is stored at row j, column i.
    loadA: for(int i = 0; i < SIZE; i++) {
      blockvec tempA = Arows.read();
      for(int j = 0; j < BLOCK_SIZE; j++) {
        // Vivado HLS directive: pipeline this loop with initiation interval 1.
        #pragma HLS PIPELINE II=1
        A[j][i] = tempA.a[j];
      }
    }
  }
  // Local accumulator for the tile; "= { 0 }" zero-initializes every element.
  DTYPE AB[BLOCK_SIZE][BLOCK_SIZE] = { 0 };
  // For each k, one word from Bcols is combined with column k of the buffered
  // A strip: AB[i][j] accumulates A[i][k] * tempB.a[j] over all k.
  partialsum: for(int k=0; k < SIZE; k++) {
    blockvec tempB = Bcols.read();
    for(int i = 0; i < BLOCK_SIZE; i++) {
      for(int j = 0; j < BLOCK_SIZE; j++) {
        AB[i][j] = AB[i][j] +  A[i][k] * tempB.a[j];
      }
    }
  }
  // Copy the finished tile into the caller-visible output structure.
  writeoutput: for(int i = 0; i < BLOCK_SIZE; i++) {
    for(int j = 0; j < BLOCK_SIZE; j++) {
      ABpartial.out[i][j] = AB[i][j];
    }
  }
}

TestBench:

#include "MatrixMul.h"
#include <stdlib.h>
using namespace std;

// Software reference: plain triple-loop product out = A * B of two
// SIZE x SIZE matrices, used to check the streamed block implementation.
void matmatmul_sw(DTYPE A[SIZE][SIZE], DTYPE B[SIZE][SIZE],
      DTYPE out[SIZE][SIZE]){
 for(int r = 0; r < SIZE; ++r){
  for(int c = 0; c < SIZE; ++c){
   // Dot product of row r of A with column c of B.
   DTYPE acc = 0;
   for(int t = 0; t < SIZE; ++t){
    acc += A[r][t] * B[t][c];
   }
   out[r][c] = acc;
  }
 }
}

// Test bench: multiplies two random SIZE x SIZE matrices tile by tile through
// blockmatmul() and compares the result with the software reference
// matmatmul_sw().
//
// Returns 0 when every element matches and 1 otherwise. The exit status must
// reflect the outcome because the HLS C simulation / co-simulation flow uses
// the return value of main() to decide whether the run passed (see UG902).
int main() {
 int fail = 0;
 hls::stream<blockvec> strm_matrix1("strm_matrix1");
 hls::stream<blockvec> strm_matrix2("strm_matrix2");
 blockvec strm_matrix1_element, strm_matrix2_element;
 blockmat block_out;
 DTYPE A[SIZE][SIZE], B[SIZE][SIZE];
 DTYPE matrix_swout[SIZE][SIZE], matrix_hwout[SIZE][SIZE];

 // Random operands (rand() is left unseeded, so runs are repeatable) and
 // zeroed result buffers.
 initmatrices: for(int i = 0; i < SIZE; i++){
  for(int j = 0; j < SIZE; j++){
   A[i][j] = rand() % 512;
   B[i][j] = rand() % 512;
   matrix_swout[i][j] = 0;
   matrix_hwout[i][j] = 0;
  }
 }

 const int blocks_per_row = SIZE / BLOCK_SIZE;
 int it = 0;  // running call index passed to blockmatmul()
 for(int row = 0; row < SIZE; row += BLOCK_SIZE) {
   for(int col = 0; col < SIZE; col += BLOCK_SIZE) {
     // blockmatmul() only consumes a strip of A on the first tile of each
     // block row, so A must be streamed on exactly those iterations.
     const bool send_a = (it % blocks_per_row == 0);

     for(int k = 0; k < SIZE; k++) {
       for(int i = 0; i < BLOCK_SIZE; i++) {
         if(send_a) strm_matrix1_element.a[i] = A[row+i][k];
         strm_matrix2_element.a[i] = B[k][col+i];
       }
       if(send_a) strm_matrix1.write(strm_matrix1_element);
       strm_matrix2.write(strm_matrix2_element);
     }
     blockmatmul(strm_matrix1, strm_matrix2, block_out, it);

     // Place the returned tile at its position in the full result matrix.
     for(int i = 0; i < BLOCK_SIZE; i++)
       for(int j = 0; j < BLOCK_SIZE; j++)
         matrix_hwout[row+i][col+j] = block_out.out[i][j];
     it = it + 1;
   }
 }

 matmatmul_sw(A, B, matrix_swout);

 for(int i = 0; i<SIZE; i++)
   for(int j = 0; j<SIZE; j++)
     if(matrix_swout[i][j] != matrix_hwout[i][j]) { fail=1; }

 if(fail==1) cout << "failed" << endl;
 else cout << "passed" << endl;

 // Non-zero on mismatch so the tool flow reports the failure.
 return fail;
}

继续的工作