使用Vitis HLS实现和优化Conv2D函数
现代卷积神经网络(CNNs)中,卷积操作(Conv2D)是最基本且计算量集中的部分。为了在嵌入式系统和FPGA平台上加速这一计算,我们可以利用Xilinx的Vitis高层次综合(HLS)工具。本文将介绍如何使用Vitis HLS实现一个基础的卷积操作,随后进行各种优化以提高其性能。
一、基础实现
我们首先实现一个简单的Conv2D函数。以下是定义和实现Conv2D函数的代码:
1.1 Conv2D函数定义 (conv2d.h)
// conv2d.h — compile-time configuration and top-level interface of the
// fixed-size 2D convolution used throughout this tutorial.
#ifndef CONV2D_H
#define CONV2D_H

#include "ap_int.h"

// Problem dimensions; compile-time constants so HLS can fully size the
// hardware (loop bounds, array partitions) at synthesis time.
#define INPUT_SIZE 32
#define KERNEL_SIZE 3
#define STRIDE 1
// "Valid" (no padding) output size for the given input/kernel/stride.
#define OUTPUT_SIZE ((INPUT_SIZE - KERNEL_SIZE) / STRIDE + 1)

// 8-bit *unsigned* pixel type used for input, kernel AND output.
// NOTE(review): because this type is unsigned and only 8 bits wide,
// negative kernel weights wrap modulo 256 and the accumulated sum
// (up to 9*255*255 for a 3x3 window) is truncated to its low 8 bits on
// store — confirm this is intended; otherwise the output (and possibly
// the kernel) should use a wider/signed ap_int type.
typedef ap_uint<8> pixel_int;

// Top-level convolution: output[i][j] = sum over the KERNEL_SIZE window
// of input[i*STRIDE+m][j*STRIDE+n] * kernel[m][n].
void conv2d(pixel_int input[INPUT_SIZE][INPUT_SIZE],
            pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE],
            pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE]);

#endif
1.2 Conv2D函数实现 (conv2d.cpp)
#include "conv2d.h"

// Baseline (purely sequential) 2D convolution: slide a
// KERNEL_SIZE x KERNEL_SIZE window across the input at the configured
// STRIDE and emit one dot product per output position. This is the
// reference implementation that the later HLS-optimized versions must
// match bit-for-bit.
void conv2d(pixel_int input[INPUT_SIZE][INPUT_SIZE],
            pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE],
            pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE]) {
    for (int row = 0; row < OUTPUT_SIZE; ++row) {
        const int in_row = row * STRIDE;  // top edge of the current window
        for (int col = 0; col < OUTPUT_SIZE; ++col) {
            const int in_col = col * STRIDE;  // left edge of the current window
            // Accumulate the window dot product in a plain int, exactly
            // like the original (the store below truncates to 8 bits).
            int acc = 0;
            for (int kr = 0; kr < KERNEL_SIZE; ++kr) {
                for (int kc = 0; kc < KERNEL_SIZE; ++kc) {
                    acc += input[in_row + kr][in_col + kc] * kernel[kr][kc];
                }
            }
            // NOTE: acc is truncated to the 8-bit pixel type on store.
            output[row][col] = acc;
        }
    }
}
1.3 测试代码 (test_conv2d.cpp)
为了验证Conv2D函数的正确性和有效性,我们实现了如下的测试代码:
#include <iostream> #include "conv2d.h" void print_output(pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE]) { for (int i = 0; i < OUTPUT_SIZE; i++) { for (int j = 0; j < OUTPUT_SIZE; j++) { std::cout << std::hex << static_cast<int>(output[i][j]) << " "; } std::cout << std::endl; } } int main() { pixel_int input[INPUT_SIZE][INPUT_SIZE]; pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE]; pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE]; // 初始化输入数据和kernel for (int i = 0; i < INPUT_SIZE; i++) { for (int j = 0; j < INPUT_SIZE; j++) { input[i][j] = i % 256; // 使用模操作,防止溢出 } } for (int i = 0; i < KERNEL_SIZE; i++) { for (int j = 0; j < KERNEL_SIZE; j++) { kernel[i][j] = 1; } } // 执行2D卷积操作 conv2d(input, kernel, output); // 打印输出数据 print_output(output); return 0; }
还有自动比对版本:
#include <iostream> #include <cstdlib> #include "../prj/conv2d.h" using namespace std; void print_output(pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE]) { for (int i = 0; i < OUTPUT_SIZE; i++) { for (int j = 0; j < OUTPUT_SIZE; j++) { cout << output[i][j] << " "; } cout << endl; } } // 计算期望结果(软件模拟) void calculate_expected_output(pixel_int input[INPUT_SIZE][INPUT_SIZE], pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE], pixel_int expected_output[OUTPUT_SIZE][OUTPUT_SIZE]) { for (int i = 0; i < OUTPUT_SIZE; i++) { for (int j = 0; j < OUTPUT_SIZE; j++) { int sum = 0; for (int m = 0; m < KERNEL_SIZE; m++) { for (int n = 0; n < KERNEL_SIZE; n++) { sum += input[i * STRIDE + m][j * STRIDE + n] * kernel[m][n]; } } expected_output[i][j] = sum; } } } // 自动化测试功能 bool run_test(pixel_int input[INPUT_SIZE][INPUT_SIZE], pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE]) { pixel_int hardware_output[OUTPUT_SIZE][OUTPUT_SIZE]; pixel_int expected_output[OUTPUT_SIZE][OUTPUT_SIZE]; // 计算软件期望结果 calculate_expected_output(input, kernel, expected_output); // 执行硬件加速函数 conv2d(input, kernel, hardware_output); // 比较硬件结果和期望结果 for (int i = 0; i < OUTPUT_SIZE; i++) { for (int j = 0; j < OUTPUT_SIZE; j++) { if (hardware_output[i][j] != expected_output[i][j]) { // 打印出错的输入和输出 cout << "Error at position (" << i << "," << j << "). 
Expected: " << expected_output[i][j] << ", Got: " << hardware_output[i][j] << endl; cout << "Input:" << endl; for (int ki = 0; ki < INPUT_SIZE; ki++) { for (int kj = 0; kj < INPUT_SIZE; kj++) { cout << input[ki][kj] << " "; } cout << endl; } cout << "Kernel:" << endl; for (int ki = 0; ki < KERNEL_SIZE; ki++) { for (int kj = 0; kj < KERNEL_SIZE; kj++) { cout << kernel[ki][kj] << " "; } cout << endl; } cout << "Hardware Output:" << endl; print_output(hardware_output); cout << "Expected Output:" << endl; print_output(expected_output); return false; } } } return true; } int main() { // 生成随机测试数据 pixel_int input[INPUT_SIZE][INPUT_SIZE]; pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE]; // 运行多次测试 const int num_tests = 10; // 测试次数 bool all_tests_passed = true; for (int test = 0; test < num_tests; test++) { // 随机生成输入数据和卷积核 for (int i = 0; i < INPUT_SIZE; i++) { for (int j = 0; j < INPUT_SIZE; j++) { input[i][j] = rand() % 256; // 随机值在0到255之间 } } for (int i = 0; i < KERNEL_SIZE; i++) { for (int j = 0; j < KERNEL_SIZE; j++) { kernel[i][j] = rand() % 3 - 1; // 随机值在-1到1之间 } } // 运行测试 if (!run_test(input, kernel)) { all_tests_passed = false; cout << "Test " << test << " failed!" << endl; } else { cout << "Test " << test << " passed!" << endl; } } if (all_tests_passed) { cout << "All tests passed!" << endl; } else { cout << "Some tests failed!" << endl; } return all_tests_passed ? 0 : 1; }
通过这些代码,我们可以在Vitis HLS中实现一个基础的Conv2D操作,并进行仿真和验证。接下来,我们将对该函数进行优化,以提高其性能。
二、优化实现
为了提高Conv2D函数的性能,我们引入多种优化技术,包括循环展开、流水线和数据流优化。
2.1 优化后的Conv2D函数实现
在优化过程中,我们使用了以下HLS指令:
#pragma HLS UNROLL:用于展开循环,提高并行度。
#pragma HLS PIPELINE:用于创建流水线,提高执行速度。
#pragma HLS ARRAY_PARTITION:用于划分数组,最大化并行内存访问。
以下是优化后的代码:
#include "conv2d.h"

// Conv2D with first-pass HLS optimizations: the output-pixel loop nest
// is pipelined and the inner kernel loops are fully unrolled, so every
// pipeline iteration performs the whole 3x3 multiply-accumulate.
void conv2d(pixel_int input[INPUT_SIZE][INPUT_SIZE],
            pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE],
            pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE]) {
    // Partition the second dimension of each array so the unrolled inner
    // loops can access several elements of a row in the same cycle
    // instead of serializing on a single memory port. (The original
    // comment said "buffer into BRAM"; ARRAY_PARTITION actually splits
    // the array into independent memories/registers.)
#pragma HLS ARRAY_PARTITION variable=input complete dim=2
#pragma HLS ARRAY_PARTITION variable=kernel complete dim=2
#pragma HLS ARRAY_PARTITION variable=output complete dim=2
    for (int i = 0; i < OUTPUT_SIZE; i++) {
        for (int j = 0; j < OUTPUT_SIZE; j++) {
            // Pipeline over output pixels: one new pixel issued per
            // initiation interval.
#pragma HLS PIPELINE
            int sum = 0;
            for (int m = 0; m < KERNEL_SIZE; m++) {
                // Fully unroll the 3x3 kernel loops: 9 parallel MACs.
#pragma HLS UNROLL
                for (int n = 0; n < KERNEL_SIZE; n++) {
#pragma HLS UNROLL
                    sum += input[i * STRIDE + m][j * STRIDE + n] * kernel[m][n];
                }
            }
            // NOTE(review): sum is truncated to the 8-bit pixel type here.
            output[i][j] = sum;
        }
    }
}
然后可以初步修改:
#include "conv2d.h"

// Streamed top-level wrapper: image, kernel and result cross the
// function boundary as flattened 1-D AXI-Stream ports, are staged into
// local 2-D buffers, convolved on-chip, and streamed back out. The
// block-level control (start/done) is exposed over AXI-Lite via the
// `return` port.
void conv2d(pixel_int input_stream[INPUT_SIZE * INPUT_SIZE],
            pixel_int kernel_stream[KERNEL_SIZE * KERNEL_SIZE],
            pixel_int output_stream[OUTPUT_SIZE * OUTPUT_SIZE]) {
#pragma HLS INTERFACE axis port=input_stream
#pragma HLS INTERFACE axis port=kernel_stream
#pragma HLS INTERFACE axis port=output_stream
#pragma HLS INTERFACE s_axilite port=return

    // Local on-chip copies of the streamed data.
    pixel_int input[INPUT_SIZE][INPUT_SIZE];
    pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE];
    pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE];

    // Stage the input stream into the local 2-D buffer, one pixel per
    // cycle (II=1).
    for (int i = 0; i < INPUT_SIZE; i++) {
        for (int j = 0; j < INPUT_SIZE; j++) {
#pragma HLS PIPELINE II=1
            input[i][j] = input_stream[i * INPUT_SIZE + j];
        }
    }

    // Stage the kernel stream into the local 2-D buffer.
    for (int i = 0; i < KERNEL_SIZE; i++) {
        for (int j = 0; j < KERNEL_SIZE; j++) {
#pragma HLS PIPELINE II=1
            kernel[i][j] = kernel_stream[i * KERNEL_SIZE + j];
        }
    }

    // Main convolution: pipelined over output pixels; the innermost
    // kernel loop is unrolled so its 3 MACs run in parallel.
    for (int i = 0; i < OUTPUT_SIZE; i++) {
        for (int j = 0; j < OUTPUT_SIZE; j++) {
#pragma HLS PIPELINE II=1
            int sum = 0;
            for (int m = 0; m < KERNEL_SIZE; m++) {
                for (int n = 0; n < KERNEL_SIZE; n++) {
#pragma HLS UNROLL
                    sum += input[i * STRIDE + m][j * STRIDE + n] * kernel[m][n];
                }
            }
            // NOTE(review): sum is truncated to the 8-bit pixel type here.
            output[i][j] = sum;
        }
    }

    // Drain the result buffer to the output stream.
    for (int i = 0; i < OUTPUT_SIZE; i++) {
        for (int j = 0; j < OUTPUT_SIZE; j++) {
#pragma HLS PIPELINE II=1
            output_stream[i * OUTPUT_SIZE + j] = output[i][j];
        }
    }
}
2.2 优化效果
通过应用上述优化技术,Conv2D函数的计算性能显著提高。具体的优化效果可以通过下述步骤进行验证:
创建一个新的Vitis HLS工程,将优化后的代码复制进去。
构建工程并设置优化后的Conv2D函数为顶层函数。
综合和评价,查看优化对资源利用率和性能的影响。
使用上述优化,我们可以在FPGA平台上加速Conv2D操作,从而提高整个卷积神经网络模型的推理速度。
分析串行代码的时钟周期数
1. 主卷积操作
外层循环 i:迭代次数为 OUTPUT_SIZE 次。
外层循环 j:迭代次数为 OUTPUT_SIZE 次。
内层循环 m:迭代次数为 KERNEL_SIZE 次。
内层循环 n:迭代次数为 KERNEL_SIZE 次。
在内层循环 m 和 n 中,每次迭代进行一次乘法和累加操作,假设每次乘加需要一个时钟周期来完成。
所以,总体的时钟周期数为:
时钟周期总数 = OUTPUT_SIZE * OUTPUT_SIZE * KERNEL_SIZE * KERNEL_SIZE
2. 其他步骤
在串行计算中,未包含输入和输出流的操作,因为在 FPGA 硬件中这些操作也是串行化的,我们忽略此部分的性能损耗。
代入实际值计算时钟周期数
假设 INPUT_SIZE = 32,KERNEL_SIZE = 3,STRIDE = 1,那么 OUTPUT_SIZE 可以通过以下公式计算:
OUTPUT_SIZE = (INPUT_SIZE - KERNEL_SIZE) / STRIDE + 1 = (32 - 3) / 1 + 1 = 30
代入实际的值,我们计算总时钟周期数:
时钟周期总数 = OUTPUT_SIZE * OUTPUT_SIZE * KERNEL_SIZE * KERNEL_SIZE = 30 * 30 * 3 * 3 = 30 * 30 * 9 = 8100
FPGA初步优化后的时钟周期:内层 3×3 乘加循环被完全展开、输出像素循环以 II=1 流水化之后,稳态下每个时钟周期即可产生一个输出像素,主卷积部分的时钟周期数约为 OUTPUT_SIZE × OUTPUT_SIZE = 30 × 30 = 900(另加少量流水线填充以及输入/输出数据搬运的开销),相比串行实现的 8100 个周期约提速 9 倍。
进一步优化之后:
// conv2d.h — compile-time configuration and top-level interface for the
// final optimized Conv2D (unchanged from the baseline header).
#ifndef CONV2D_H
#define CONV2D_H

#include "ap_int.h"

// Problem dimensions; compile-time constants so HLS can fully size the
// generated hardware at synthesis time.
#define INPUT_SIZE 32
#define KERNEL_SIZE 3
#define STRIDE 1
// "Valid" (no padding) output size for the given input/kernel/stride.
#define OUTPUT_SIZE ((INPUT_SIZE - KERNEL_SIZE) / STRIDE + 1)

// 8-bit *unsigned* pixel type used for input, kernel AND output.
// NOTE(review): negative kernel weights wrap modulo 256 and the MAC sum
// is truncated to 8 bits on store — confirm this is intended; otherwise
// widen/sign the output (and kernel) type.
typedef ap_uint<8> pixel_int;

// Top-level convolution: output[i][j] = sum over the KERNEL_SIZE window
// of input[i*STRIDE+m][j*STRIDE+n] * kernel[m][n].
void conv2d(pixel_int input[INPUT_SIZE][INPUT_SIZE],
            pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE],
            pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE]);

#endif
#include "conv2d.h"

// Final optimized Conv2D: row-wise array partitioning + II=1 pipeline
// over output pixels + full unroll of the 3x3 kernel loops.
void conv2d(pixel_int input[INPUT_SIZE][INPUT_SIZE],
            pixel_int kernel[KERNEL_SIZE][KERNEL_SIZE],
            pixel_int output[OUTPUT_SIZE][OUTPUT_SIZE]) {
    // Partition dim 2 of each array so the unrolled inner loops can read
    // multiple row elements per cycle. (Original comment said "use BRAM
    // storage"; ARRAY_PARTITION actually splits the arrays into
    // independent memories/registers.)
#pragma HLS ARRAY_PARTITION variable=input complete dim=2
#pragma HLS ARRAY_PARTITION variable=kernel complete dim=2
#pragma HLS ARRAY_PARTITION variable=output complete dim=2
    // Keep conv2d as its own RTL module rather than inlining it into a
    // caller, so the pipeline below is preserved as written.
#pragma HLS INLINE off
    for (int i = 0; i < OUTPUT_SIZE; i++) {
        for (int j = 0; j < OUTPUT_SIZE; j++) {
            // One output pixel per clock in steady state.
#pragma HLS PIPELINE II=1
            int sum = 0;
            for (int m = 0; m < KERNEL_SIZE; m++) {
                // factor=3 equals a full unroll, since KERNEL_SIZE is 3.
#pragma HLS UNROLL factor=3
                for (int n = 0; n < KERNEL_SIZE; n++) {
#pragma HLS UNROLL factor=3
                    sum += input[i * STRIDE + m][j * STRIDE + n] * kernel[m][n];
                }
            }
            // NOTE(review): sum is truncated to the 8-bit pixel type here.
            output[i][j] = sum;
        }
    }
}
好的,已经初见效果了!
本程序还可以进一步优化,我们将在后面统一发布的完整版本中进行更新。