使用Vitis HLS实现和优化Conv2D函数
现代卷积神经网络(CNNs)中,卷积操作(Conv2D)是最基本且计算量集中的部分。为了在嵌入式系统和FPGA平台上加速这一计算,我们可以利用Xilinx的Vitis高层次综合(HLS)工具。本文将介绍如何使用Vitis HLS实现一个基础的卷积操作,随后进行各种优化以提高其性能。
一、基础实现
我们首先实现一个简单的Conv2D函数。以下是定义和实现Conv2D函数的代码:
1.1 Conv2D函数定义 (conv2d.h)
#ifndef CONV2D_H #define CONV2D_H #include "ap_int.h" #define INPUT_SIZE 32 #define KERNEL_SIZE 3 #define STRIDE 1 #define OUTPUT_SIZE ((INPUT_SIZE - KERNEL_SIZE) / STRIDE + 1) typedef ap_uint<8> pixel_int; void conv2d(pixel_int input[INPUT_SIZE][INPUT_SIZE], pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE], pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE]); #endif
1.2 Conv2D函数实现 (conv2d.cpp)
#include "conv2d.h"
void conv2d(pixel_int input[INPUT_SIZE][INPUT_SIZE], pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE], pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE]) {
    // Slide the kernel window over every valid output position.
    for (int row = 0; row < OUTPUT_SIZE; row++) {
        for (int col = 0; col < OUTPUT_SIZE; col++) {
            // Multiply-accumulate over the KERNEL_SIZE x KERNEL_SIZE window.
            int acc = 0;
            for (int kr = 0; kr < KERNEL_SIZE; kr++) {
                for (int kc = 0; kc < KERNEL_SIZE; kc++) {
                    acc += input[row * STRIDE + kr][col * STRIDE + kc] * kernel[kr][kc];
                }
            }
            // NOTE(review): the 32-bit accumulator is truncated to the 8-bit
            // pixel type on this store — confirm wraparound is intended.
            output[row][col] = acc;
        }
    }
}
1.3 测试代码 (test_conv2d.cpp)
为了验证Conv2D函数的正确性和有效性,我们实现了如下的测试代码:
#include <iostream>
#include "conv2d.h"
// Print the output matrix in hexadecimal, one row per line.
// NOTE(review): std::hex is left set on std::cout after this call.
void print_output(pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE]) {
    for (int r = 0; r < OUTPUT_SIZE; r++) {
        for (int c = 0; c < OUTPUT_SIZE; c++) {
            std::cout << std::hex << static_cast<int>(output[r][c]) << " ";
        }
        std::cout << std::endl;
    }
}
int main() {
    pixel_int input[INPUT_SIZE][INPUT_SIZE];
    pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE];
    pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE];

    // Fill the image so every pixel in row r holds the value r.
    // (r never exceeds INPUT_SIZE-1 here, so % 256 is only a safety net.)
    for (int r = 0; r < INPUT_SIZE; r++) {
        for (int c = 0; c < INPUT_SIZE; c++) {
            input[r][c] = r % 256;
        }
    }

    // All-ones kernel: each output pixel becomes the sum of its 3x3 window.
    for (int r = 0; r < KERNEL_SIZE; r++) {
        for (int c = 0; c < KERNEL_SIZE; c++) {
            kernel[r][c] = 1;
        }
    }

    // Run the 2D convolution and dump the result.
    conv2d(input, kernel, output);
    print_output(output);
    return 0;
}
还有自动比对版本:
#include <iostream>
#include <cstdlib>
#include "../prj/conv2d.h"
using namespace std;
// Print the output matrix (default ap_uint formatting), one row per line.
void print_output(pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE]) {
    for (int r = 0; r < OUTPUT_SIZE; r++) {
        for (int c = 0; c < OUTPUT_SIZE; c++) {
            cout << output[r][c] << " ";
        }
        cout << endl;
    }
}
// 计算期望结果(软件模拟)
void calculate_expected_output(pixel_int input[INPUT_SIZE][INPUT_SIZE], pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE], pixel_int expected_output[OUTPUT_SIZE][OUTPUT_SIZE]) {
for (int i = 0; i < OUTPUT_SIZE; i++) {
for (int j = 0; j < OUTPUT_SIZE; j++) {
int sum = 0;
for (int m = 0; m < KERNEL_SIZE; m++) {
for (int n = 0; n < KERNEL_SIZE; n++) {
sum += input[i * STRIDE + m][j * STRIDE + n] * kernel[m][n];
}
}
expected_output[i][j] = sum;
}
}
}
// 自动化测试功能
// Run one comparison between the hardware conv2d and the software reference.
// On the first mismatch, dump position, inputs and both outputs, then fail.
bool run_test(pixel_int input[INPUT_SIZE][INPUT_SIZE], pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE]) {
    pixel_int hardware_output[OUTPUT_SIZE][OUTPUT_SIZE];
    pixel_int expected_output[OUTPUT_SIZE][OUTPUT_SIZE];

    calculate_expected_output(input, kernel, expected_output);  // golden model
    conv2d(input, kernel, hardware_output);                     // device under test

    for (int i = 0; i < OUTPUT_SIZE; i++) {
        for (int j = 0; j < OUTPUT_SIZE; j++) {
            if (hardware_output[i][j] == expected_output[i][j]) {
                continue;  // element matches; keep scanning
            }
            // Mismatch: report full context and abort this test case.
            cout << "Error at position (" << i << "," << j << "). Expected: " << expected_output[i][j] << ", Got: " << hardware_output[i][j] << endl;
            cout << "Input:" << endl;
            for (int r = 0; r < INPUT_SIZE; r++) {
                for (int c = 0; c < INPUT_SIZE; c++) {
                    cout << input[r][c] << " ";
                }
                cout << endl;
            }
            cout << "Kernel:" << endl;
            for (int r = 0; r < KERNEL_SIZE; r++) {
                for (int c = 0; c < KERNEL_SIZE; c++) {
                    cout << kernel[r][c] << " ";
                }
                cout << endl;
            }
            cout << "Hardware Output:" << endl;
            print_output(hardware_output);
            cout << "Expected Output:" << endl;
            print_output(expected_output);
            return false;
        }
    }
    return true;
}
int main() {
    pixel_int input[INPUT_SIZE][INPUT_SIZE];
    pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE];

    const int num_tests = 10;  // number of randomized trials
    bool all_tests_passed = true;

    for (int test = 0; test < num_tests; test++) {
        // Random image: pixel values in [0, 255].
        for (int i = 0; i < INPUT_SIZE; i++) {
            for (int j = 0; j < INPUT_SIZE; j++) {
                input[i][j] = rand() % 256;
            }
        }
        // Kernel values drawn from {-1, 0, 1}.
        // NOTE(review): pixel_int is unsigned 8-bit, so -1 is stored as 255;
        // hardware and reference agree, but weights are not truly signed.
        for (int i = 0; i < KERNEL_SIZE; i++) {
            for (int j = 0; j < KERNEL_SIZE; j++) {
                kernel[i][j] = rand() % 3 - 1;
            }
        }
        // Compare hardware result against the software golden model.
        if (run_test(input, kernel)) {
            cout << "Test " << test << " passed!" << endl;
        } else {
            all_tests_passed = false;
            cout << "Test " << test << " failed!" << endl;
        }
    }

    if (all_tests_passed) {
        cout << "All tests passed!" << endl;
    } else {
        cout << "Some tests failed!" << endl;
    }
    return all_tests_passed ? 0 : 1;
}
通过这些代码,我们可以在Vitis HLS中实现一个基础的Conv2D操作,并进行仿真和验证。接下来,我们将对该函数进行优化,以提高其性能。


二、优化实现
为了提高Conv2D函数的性能,我们引入多种优化技术,包括循环展开、流水线和数据流优化。
2.1 优化后的Conv2D函数实现
在优化过程中,我们使用了以下HLS指令:
#pragma HLS UNROLL:展开循环,提高并行度。
#pragma HLS PIPELINE:创建流水线,提高执行速度。
#pragma HLS ARRAY_PARTITION:划分数组,最大化并行内存访问。
以下是优化后的代码:
#include "conv2d.h"
void conv2d(pixel_int input[INPUT_SIZE][INPUT_SIZE], pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE], pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE]) {
// Split each row across independent memories so the unrolled 3x3 window
// reads below can be serviced in parallel.
// NOTE(review): these pragmas partition the top-level interface arrays —
// confirm the chosen interface supports this in the target Vitis HLS flow.
#pragma HLS ARRAY_PARTITION variable=input complete dim=2
#pragma HLS ARRAY_PARTITION variable=kernel complete dim=2
#pragma HLS ARRAY_PARTITION variable=output complete dim=2
for (int i = 0; i < OUTPUT_SIZE; i++) {
for (int j = 0; j < OUTPUT_SIZE; j++) {
// Pipeline the per-output-pixel computation.
#pragma HLS PIPELINE
int sum = 0;
for (int m = 0; m < KERNEL_SIZE; m++) {
// Fully unroll the 3x3 multiply-accumulate window.
#pragma HLS UNROLL
for (int n = 0; n < KERNEL_SIZE; n++) {
#pragma HLS UNROLL
sum += input[i * STRIDE + m][j * STRIDE + n] * kernel[m][n];
}
}
// 32-bit sum is truncated to the 8-bit pixel type on this store.
output[i][j] = sum;
}
}
}然后可以初步修改:
#include "conv2d.h"
void conv2d(pixel_int input_stream[INPUT_SIZE * INPUT_SIZE], pixel_int kernel_stream[KERNEL_SIZE * KERNEL_SIZE], pixel_int output_stream[OUTPUT_SIZE * OUTPUT_SIZE]) {
// AXI-Stream interfaces for the data ports; AXI-Lite for control/return.
#pragma HLS INTERFACE axis port=input_stream
#pragma HLS INTERFACE axis port=kernel_stream
#pragma HLS INTERFACE axis port=output_stream
#pragma HLS INTERFACE s_axilite port=return
// On-chip working copies of the image, kernel and result.
pixel_int input[INPUT_SIZE][INPUT_SIZE];
pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE];
pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE];
// Copy the input stream into the local 2-D array (row-major order).
for (int i = 0; i < INPUT_SIZE; i++) {
for (int j = 0; j < INPUT_SIZE; j++) {
#pragma HLS PIPELINE II=1
input[i][j] = input_stream[i * INPUT_SIZE + j];
}
}
// Copy the kernel stream into the local 2-D array.
for (int i = 0; i < KERNEL_SIZE; i++) {
for (int j = 0; j < KERNEL_SIZE; j++) {
#pragma HLS PIPELINE II=1
kernel[i][j] = kernel_stream[i * KERNEL_SIZE + j];
}
}
// Main convolution: one pipelined iteration per output pixel; the inner
// 3x3 window is unrolled into parallel multiply-accumulates.
for (int i = 0; i < OUTPUT_SIZE; i++) {
for (int j = 0; j < OUTPUT_SIZE; j++) {
#pragma HLS PIPELINE II=1
int sum = 0;
for (int m = 0; m < KERNEL_SIZE; m++) {
for (int n = 0; n < KERNEL_SIZE; n++) {
#pragma HLS UNROLL
sum += input[i * STRIDE + m][j * STRIDE + n] * kernel[m][n];
}
}
// 32-bit sum is truncated to the 8-bit pixel type on this store.
output[i][j] = sum;
}
}
// Stream the result back out in row-major order.
for (int i = 0; i < OUTPUT_SIZE; i++) {
for (int j = 0; j < OUTPUT_SIZE; j++) {
#pragma HLS PIPELINE II=1
output_stream[i * OUTPUT_SIZE + j] = output[i][j];
}
}
}2.2 优化效果
通过应用上述优化技术,Conv2D函数的计算性能显著提高。具体的优化效果可以通过下述步骤进行验证:
创建一个新的Vitis HLS工程,将优化后的代码复制进去。
构建工程并设置优化后的Conv2D函数为顶层函数。
综合和评价,查看优化对资源利用率和性能的影响。
使用上述优化,我们可以在FPGA平台上加速Conv2D操作,从而提高整个卷积神经网络模型的推理速度。
分析串行代码的时钟周期数
1. 主卷积操作
外层循环 i:迭代 OUTPUT_SIZE 次;
内层循环 j:迭代 OUTPUT_SIZE 次;
内层循环 m:迭代 KERNEL_SIZE 次;
内层循环 n:迭代 KERNEL_SIZE 次。
在内层循环 m 和 n 中,每次迭代进行一次乘法和累加操作,需要一个时钟周期来完成。
所以,总体的时钟周期数为:
时钟周期总数 = OUTPUT_SIZE * OUTPUT_SIZE * KERNEL_SIZE * KERNEL_SIZE
2. 其他步骤
在串行周期估算中,我们未计入输入和输出数据搬运的开销:这些传输在 FPGA 硬件中同样是按顺序执行的,对各版本的相对比较影响一致,因此在此忽略。
代入实际值计算时钟周期数
假设 INPUT_SIZE = 32,KERNEL_SIZE = 3,STRIDE = 1,那么 OUTPUT_SIZE 可以通过以下公式计算:
OUTPUT_SIZE = (INPUT_SIZE - KERNEL_SIZE) / STRIDE + 1 = 30
代入实际的值,我们计算总时钟周期数:
时钟周期总数 = OUTPUT_SIZE * OUTPUT_SIZE * KERNEL_SIZE * KERNEL_SIZE = 30 * 30 * 3 * 3 = 30 * 30 * 9 = 8100
FPGA初步优化后的时钟周期:

进一步优化之后:
#ifndef CONV2D_H #define CONV2D_H #include "ap_int.h" #define INPUT_SIZE 32 #define KERNEL_SIZE 3 #define STRIDE 1 #define OUTPUT_SIZE ((INPUT_SIZE - KERNEL_SIZE) / STRIDE + 1) typedef ap_uint<8> pixel_int; void conv2d(pixel_int input[INPUT_SIZE][INPUT_SIZE], pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE], pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE]); #endif
#include "conv2d.h"
void conv2d(pixel_int input[INPUT_SIZE][INPUT_SIZE], pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE], pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE]) {
// Partition each row across separate memories so the fully unrolled 3x3
// window can be read in parallel.
#pragma HLS ARRAY_PARTITION variable=input complete dim=2
#pragma HLS ARRAY_PARTITION variable=kernel complete dim=2
#pragma HLS ARRAY_PARTITION variable=output complete dim=2
// Keep this function as its own hierarchy level so the pipeline below
// applies here rather than after inlining into a caller.
#pragma HLS INLINE off
for (int i = 0; i < OUTPUT_SIZE; i++) {
for (int j = 0; j < OUTPUT_SIZE; j++) {
// Target one output pixel per clock once the pipeline fills (II=1).
#pragma HLS PIPELINE II=1
int sum = 0;
for (int m = 0; m < KERNEL_SIZE; m++) {
// factor=3 equals KERNEL_SIZE, i.e. a complete unroll of each loop.
#pragma HLS UNROLL factor=3
for (int n = 0; n < KERNEL_SIZE; n++) {
#pragma HLS UNROLL factor=3
sum += input[i * STRIDE + m][j * STRIDE + n] * kernel[m][n];
}
}
// 32-bit sum is truncated to the 8-bit pixel type on this store.
output[i][j] = sum;
}
}
}
好的,已经初见效果了!
本程序还可以进一步优化,我们将在后面统一发布的完整版本中进行更新。




