VivadoHLS实现的矩阵分块乘
10月 23日, 2019
LinearAlgebra
VivadoHLS
Vivado HLS
官方文档: (UG902) Vivado Design Suite User Guide: High-Level Synthesis
- c++编写算法/可读性
- c++层面的算法验证
- 使用优化指令控制c综合过程/P6/pragma/
- 可以导出为Vivado IP核或System Generator模块
使用HLS需要做什么
- C,C++或者SystemC 实现的函数
- 约束,包括时钟周期、时钟不确定度、目标硬件
- 指令,在综合的过程中用于特殊的行为和优化
- TestBench,算法层面的验证
使用HLS能得到什么
- HDL形式的RTL输出,并打包成IP核
- 报告文件
优化指令/P16/
- 指定为Pipeline,在当前任务结束前开始执行下次任务
- 指定延迟
- 约束资源占用
- 覆盖操作之间固有或隐含的依赖关系,允许执行特定操作(例如在可以安全忽略初始数据值时,允许内存先读后写)
- 选择IO协议
矩阵分块乘法的例子
- 只能对一个顶层函数进行综合,这里是 blockmatmul
- TestBench的函数名是 main
- HLS支持所有原生数据类型以及任意精度数据类型,不支持动态内存分配/P14/
- 官方的库质量很高,“高性能设计和优化的资源占用”/P15/hls::stream/
头文件(MatrixMul.h):
#ifndef _BLOCK_MM_H_
#define _BLOCK_MM_H_
#include "hls_stream.h"
#include <iostream>
#include <iomanip>
#include <vector>
using namespace std;
// Element type used for every matrix entry and accumulator.
typedef int DTYPE;

// Edge length of the full square matrices.
const int SIZE = 16;
// Edge length of one square tile; SIZE / BLOCK_SIZE tiles span a matrix edge.
const int BLOCK_SIZE = 4;

// One stream element: BLOCK_SIZE values transferred together.
struct blockvec {
    DTYPE a[BLOCK_SIZE];
};

// One BLOCK_SIZE x BLOCK_SIZE tile of the result matrix.
struct blockmat {
    DTYPE out[BLOCK_SIZE][BLOCK_SIZE];
};
void blockmatmul(hls::stream<blockvec> &Arows, hls::stream<blockvec> &Bcols,
blockmat & ABpartial, DTYPE iteration);
#endif
分块乘:
#include "MatrixMul.h"
/*
 * Computes one BLOCK_SIZE x BLOCK_SIZE tile of a matrix product.
 *
 * Arows     : supplies SIZE stream elements, read only when
 *             it % (SIZE / BLOCK_SIZE) == 0. Element number c carries the
 *             BLOCK_SIZE values stored into A[0..BLOCK_SIZE-1][c].
 * Bcols     : supplies SIZE stream elements on every call. Element number k
 *             carries the BLOCK_SIZE values multiplied against A[..][k].
 * ABpartial : overwritten with the accumulated tile.
 * it        : running call index; selects whether A is reloaded.
 */
void blockmatmul(hls::stream<blockvec> &Arows, hls::stream<blockvec> &Bcols,
                 blockmat &ABpartial, int it) {
#pragma HLS DATAFLOW
    int phase = it % (SIZE / BLOCK_SIZE);

    // static: the buffered A values persist across calls so they can be
    // reused on the calls that skip the load below.
    static DTYPE A[BLOCK_SIZE][SIZE];

    if (phase == 0) { // refresh the buffered A values only on the first call of a group
        loadA: for (int c = 0; c < SIZE; c++) {
            blockvec incomingA = Arows.read();
            for (int r = 0; r < BLOCK_SIZE; r++) {
#pragma HLS PIPELINE II=1
                A[r][c] = incomingA.a[r];
            }
        }
    }

    // Tile accumulator, cleared on every call.
    DTYPE AB[BLOCK_SIZE][BLOCK_SIZE] = { 0 };

    // Each stream element from Bcols contributes one rank-1 update to the tile.
    partialsum: for (int k = 0; k < SIZE; k++) {
        blockvec incomingB = Bcols.read();
        for (int r = 0; r < BLOCK_SIZE; r++) {
            for (int c = 0; c < BLOCK_SIZE; c++) {
                AB[r][c] += A[r][k] * incomingB.a[c];
            }
        }
    }

    // Copy the finished tile to the output argument.
    writeoutput: for (int r = 0; r < BLOCK_SIZE; r++) {
        for (int c = 0; c < BLOCK_SIZE; c++) {
            ABpartial.out[r][c] = AB[r][c];
        }
    }
}
TestBench:
#include "MatrixMul.h"
#include <stdlib.h>
using namespace std;
// Software reference: out = A * B for SIZE x SIZE matrices, computed with the
// plain triple loop. Every entry of out is overwritten.
void matmatmul_sw(DTYPE A[SIZE][SIZE], DTYPE B[SIZE][SIZE],
                  DTYPE out[SIZE][SIZE]) {
    for (int r = 0; r < SIZE; ++r) {
        for (int c = 0; c < SIZE; ++c) {
            // Dot product of row r of A with column c of B.
            DTYPE acc = 0;
            for (int k = 0; k < SIZE; ++k) {
                acc += A[r][k] * B[k][c];
            }
            out[r][c] = acc;
        }
    }
}
/*
 * Test bench for blockmatmul.
 *
 * Fills two SIZE x SIZE matrices with pseudo-random values, streams them to
 * blockmatmul one output tile at a time, assembles the tiles into
 * matrix_hwout and compares the result with matmatmul_sw.
 *
 * Returns 0 when both results match and 1 otherwise, so that a tool or script
 * checking the exit status sees a mismatch as a failure.
 */
int main() {
    int fail = 0;
    hls::stream<blockvec> strm_matrix1("strm_matrix1");
    hls::stream<blockvec> strm_matrix2("strm_matrix2");
    blockvec strm_matrix1_element, strm_matrix2_element;
    blockmat block_out;
    DTYPE A[SIZE][SIZE], B[SIZE][SIZE];
    DTYPE matrix_swout[SIZE][SIZE], matrix_hwout[SIZE][SIZE];

    // rand() is never seeded, so every run uses the same input data.
    initmatrices: for (int i = 0; i < SIZE; i++) {
        for (int j = 0; j < SIZE; j++) {
            A[i][j] = rand() % 512;
            B[i][j] = rand() % 512;
            matrix_swout[i][j] = 0;
            matrix_hwout[i][j] = 0;
        }
    }

    int row, col, it = 0;
    for (int it1 = 0; it1 < SIZE; it1 = it1 + BLOCK_SIZE) {
        for (int it2 = 0; it2 < SIZE; it2 = it2 + BLOCK_SIZE) {
            row = it1; // first row of the output tile
            col = it2; // first column of the output tile

            // blockmatmul reads the A stream only when
            // it % (SIZE / BLOCK_SIZE) == 0, so A data is sent only then;
            // anything else would leave unread elements in strm_matrix1.
            const bool sendA = (it % (SIZE / BLOCK_SIZE)) == 0;

            for (int k = 0; k < SIZE; k++) {
                for (int i = 0; i < BLOCK_SIZE; i++) {
                    if (sendA) strm_matrix1_element.a[i] = A[row + i][k];
                    strm_matrix2_element.a[i] = B[k][col + i];
                }
                if (sendA) strm_matrix1.write(strm_matrix1_element);
                strm_matrix2.write(strm_matrix2_element);
            }

            blockmatmul(strm_matrix1, strm_matrix2, block_out, it);

            // Place the returned tile into the full result matrix.
            for (int i = 0; i < BLOCK_SIZE; i++)
                for (int j = 0; j < BLOCK_SIZE; j++)
                    matrix_hwout[row + i][col + j] = block_out.out[i][j];

            it = it + 1;
        }
    }

    matmatmul_sw(A, B, matrix_swout);

    for (int i = 0; i < SIZE; i++)
        for (int j = 0; j < SIZE; j++)
            if (matrix_swout[i][j] != matrix_hwout[i][j]) { fail = 1; }

    if (fail == 1) cout << "failed" << endl;
    else cout << "passed" << endl;

    // Propagate the verdict as the exit status instead of always returning 0.
    return fail;
}
继续的工作
- 矩阵数据类型改为定点数,需要学习HLS提供的库/P201/
- 修改矩阵输入数据的顺序
- 任意形状以及不能整除的分块
- HLS提供的线性代数库
- 性能比较
- 读文档