神经网络的FPGA实现：基础卷积操作（一）——RGB三通道（Verilog HDL，Xilinx VIVADO）。conv_pe.v文件在上述链接中。
`timescale 1ns / 1ps

// conv_layer
//
// One 3x3 convolution layer built from three conv_pe processing elements
// (conv_pe is defined in conv_pe.v, which is not part of this file).
//
// Operation:
//   1. Weights and biases are streamed in one byte per clock while
//      input_weight_en / input_bias_en are high.  Once both tables are full,
//      write_done_weight_bias goes high and stays high until reset.
//   2. Each cycle with input_fmap_en high presents one 3x3 window per input
//      channel on `fmap`.  The three conv_pe instances process the windows in
//      parallel; when all three report valid_out, output_en is asserted and
//      end_data carries the channel sum plus bias.
//
// Arithmetic note: every per-channel 32-bit sum is reduced to its low 8 bits
// (plain truncation, i.e. modulo 256 - there is no saturation), the three
// bytes and the sign-extended bias are added, and the low 8 bits of that sum
// are driven on end_data.
//
// Limitation: although CHANNEL_IN / CHANNEL_OUT size the ports and the
// coefficient memories, the datapath below is hard-wired for CHANNEL_IN = 3
// and CHANNEL_OUT = 1: it reads weight[0..26], fmap[215:0] and bias[0] only.
// For CHANNEL_OUT > 1 only end_data[7:0] carries data; the upper bits are
// zero-extended by the assignment.
module conv_layer#(
    parameter CHANNEL_IN  = 3,  // number of input channels (R, G, B)
    parameter CHANNEL_OUT = 1   // number of output channels
)(
    input clk,                              // clock
    input rst,                              // synchronous reset, active high
    input input_weight_en,                  // weight write enable
    input input_bias_en,                    // bias write enable
    // kernel coefficient inputs
    input [7:0] weight_ab,                  // weight byte written this cycle
    input [7:0] bias_ab,                    // bias byte written this cycle
    output write_done_weight_bias,          // high once all weights and biases are loaded
    // input feature map
    input input_fmap_en,                    // feature-map window valid
    input [9*8*CHANNEL_IN-1:0] fmap,        // one 3x3 window of 8-bit pixels per input channel
    output output_en,                       // result valid
    output [8*CHANNEL_OUT-1:0] end_data     // 8-bit result (one output channel)
);

    localparam WEIGHT_NUM = 9*CHANNEL_IN*CHANNEL_OUT;  // total number of weight bytes
    localparam BIAS_NUM   = CHANNEL_OUT;               // total number of bias bytes

    // Coefficient storage.  Every weight word is read in parallel by the
    // conv_pe instances below.  The original author notes that this memory
    // style would have to be replaced by a RAM for synthesis.
    reg [7:0] weight [WEIGHT_NUM-1:0];
    reg [7:0] bias   [BIAS_NUM-1:0];

    // ------------------------------------------------------------------
    // Weight loading
    // ------------------------------------------------------------------
    reg [15:0] weight_count;       // number of weights written so far
    reg        weight_write_done;  // sticky: all weights written

    always @(posedge clk) begin
        if (rst) begin
            weight_count      <= 0;
            weight_write_done <= 0;
        end
        else begin
            // Only accept a write while the table is not yet full.  Without
            // this guard a write enable held high past the last word would
            // index beyond the array and, once the counter wrapped around,
            // overwrite weights that were already loaded.
            if (input_weight_en && (weight_count < WEIGHT_NUM)) begin
                weight[weight_count] <= weight_ab;
                weight_count         <= weight_count + 1;
            end
            if (weight_count == WEIGHT_NUM) begin
                weight_write_done <= 1;
            end
        end
    end

    // ------------------------------------------------------------------
    // Bias loading
    // ------------------------------------------------------------------
    reg [7:0] bias_count;       // number of biases written so far
    reg       bias_write_done;  // sticky: all biases written

    always @(posedge clk) begin
        if (rst) begin
            bias_count      <= 0;
            bias_write_done <= 0;
        end
        else begin
            // Same overrun guard as for the weights; the 8-bit counter would
            // otherwise wrap after 256 enabled cycles and clobber bias[0].
            if (input_bias_en && (bias_count < BIAS_NUM)) begin
                bias[bias_count] <= bias_ab;
                bias_count       <= bias_count + 1;
            end
            if (bias_count == BIAS_NUM) begin
                bias_write_done <= 1;
            end
        end
    end

    // Coefficient transfer is complete when both tables are full.
    assign write_done_weight_bias = weight_write_done & bias_write_done;

    // ------------------------------------------------------------------
    // Feature-map computation
    //
    // One conv_pe per (input channel, output channel) pair gives the fastest
    // computation; in a real design the number of PEs is a resource/speed
    // trade-off.  Here: 3 input channels x 1 output channel = 3 PEs.
    // ------------------------------------------------------------------
    wire        valid_out_1, valid_out_2, valid_out_3;  // per-channel result valid
    wire [31:0] sum_data_1,  sum_data_2,  sum_data_3;   // per-channel 3x3 sums

    // Channel slot 0: weight[0..8], fmap[71:0]
    conv_pe uut_conv_pe_1(
        .clk(clk),
        .rst(rst),
        .input_en(input_fmap_en),
        .kernel_00(weight[0]),
        .kernel_01(weight[1]),
        .kernel_02(weight[2]),
        .kernel_10(weight[3]),
        .kernel_11(weight[4]),
        .kernel_12(weight[5]),
        .kernel_20(weight[6]),
        .kernel_21(weight[7]),
        .kernel_22(weight[8]),
        .fmap_00(fmap[7:0]),
        .fmap_01(fmap[15:8]),
        .fmap_02(fmap[23:16]),
        .fmap_10(fmap[31:24]),
        .fmap_11(fmap[39:32]),
        .fmap_12(fmap[47:40]),
        .fmap_20(fmap[55:48]),
        .fmap_21(fmap[63:56]),
        .fmap_22(fmap[71:64]),
        .valid_out(valid_out_1),
        .sum_data(sum_data_1)
    );

    // Channel slot 1: weight[9..17], fmap[143:72]
    conv_pe uut_conv_pe_2(
        .clk(clk),
        .rst(rst),
        .input_en(input_fmap_en),
        .kernel_00(weight[9]),
        .kernel_01(weight[10]),
        .kernel_02(weight[11]),
        .kernel_10(weight[12]),
        .kernel_11(weight[13]),
        .kernel_12(weight[14]),
        .kernel_20(weight[15]),
        .kernel_21(weight[16]),
        .kernel_22(weight[17]),
        .fmap_00(fmap[79:72]),
        .fmap_01(fmap[87:80]),
        .fmap_02(fmap[95:88]),
        .fmap_10(fmap[103:96]),
        .fmap_11(fmap[111:104]),
        .fmap_12(fmap[119:112]),
        .fmap_20(fmap[127:120]),
        .fmap_21(fmap[135:128]),
        .fmap_22(fmap[143:136]),
        .valid_out(valid_out_2),
        .sum_data(sum_data_2)
    );

    // Channel slot 2: weight[18..26], fmap[215:144]
    conv_pe uut_conv_pe_3(
        .clk(clk),
        .rst(rst),
        .input_en(input_fmap_en),
        .kernel_00(weight[18]),
        .kernel_01(weight[19]),
        .kernel_02(weight[20]),
        .kernel_10(weight[21]),
        .kernel_11(weight[22]),
        .kernel_12(weight[23]),
        .kernel_20(weight[24]),
        .kernel_21(weight[25]),
        .kernel_22(weight[26]),
        .fmap_00(fmap[151:144]),
        .fmap_01(fmap[159:152]),
        .fmap_02(fmap[167:160]),
        .fmap_10(fmap[175:168]),
        .fmap_11(fmap[183:176]),
        .fmap_12(fmap[191:184]),
        .fmap_20(fmap[199:192]),
        .fmap_21(fmap[207:200]),
        .fmap_22(fmap[215:208]),
        .valid_out(valid_out_3),
        .sum_data(sum_data_3)
    );

    // ------------------------------------------------------------------
    // Channel accumulation and output
    // ------------------------------------------------------------------
    wire        [31:0] sum_data_32;  // sum over the three channels plus bias
    wire signed [7:0]  bias_temp;    // bias of output channel 0

    assign bias_temp = bias[0];

    // The result is valid only when all three PEs report a valid sum.
    assign output_en = valid_out_1 & valid_out_2 & valid_out_3;

    // Keep only the low byte of each channel sum (truncation, not saturation)
    // and zero-extend it back to 32 bits.
    wire [31:0] sum_data_1_Q, sum_data_2_Q, sum_data_3_Q;
    assign sum_data_1_Q = {24'd0, sum_data_1[7:0]};
    assign sum_data_2_Q = {24'd0, sum_data_2[7:0]};
    assign sum_data_3_Q = {24'd0, sum_data_3[7:0]};

    // Add the three channel bytes and the sign-extended bias; forced to zero
    // while output_en is low.
    assign sum_data_32 = (output_en == 1)
                       ? sum_data_1_Q + sum_data_2_Q + sum_data_3_Q + {{24{bias_temp[7]}}, bias_temp}
                       : 32'd0;

    // Only the low byte of the accumulated value is output.
    assign end_data = sum_data_32[7:0];

    // Counts the results produced since reset.  Not used by any logic in this
    // module; presumably kept for waveform inspection.
    reg [31:0] conv_pe_count;
    always @(posedge clk) begin
        if (rst) begin
            conv_pe_count <= 0;
        end
        else begin
            if (output_en) begin
                conv_pe_count <= conv_pe_count + 1;
            end
        end
    end

endmodule

// Article note: the testbench needs imageBlueChannels.txt,
// imageGreenChannels.txt and imageRedChannels.txt as well as bias.txt and
// weight.txt; place them in the project folder.
`timescale 1ns / 1ps

// tb_conv_layer
//
// Testbench for conv_layer with 3 input channels and 1 output channel.
//   1. Holds reset for 1000 time units, then streams weight.txt and bias.txt
//      into the DUT one byte per clock.
//   2. Once the DUT reports write_done_weight_bias, slides a 3x3 window over
//      the 482x322 R/G/B images (read from text files) and presents one
//      window every second clock.
//   3. Writes every valid DUT result as a hex line to conv_layer_result.txt,
//      then closes the file and ends the simulation once the whole image has
//      been processed.
module tb_conv_layer;

    reg clk;
    reg rst;

    // Reset is asserted for the first 1000 time units.
    initial begin
        rst = 1;
        #1000 rst = 0;
    end

    // Free-running clock with a period of 10 time units.
    always begin: clk1_blk
        clk = 0;
        forever #5 clk = ~clk;
    end

    localparam CHANNEL_IN  = 3;
    localparam CHANNEL_OUT = 1;

    reg input_weight_en, input_bias_en, input_fmap_en;  // weight / bias / feature-map enables
    reg [7:0] weight_ab, bias_ab;                       // byte written to the DUT this cycle
    reg [9*8*CHANNEL_IN-1:0] fmap_ab;                   // 3x3 window x 8 bit x CHANNEL_IN
    wire write_done_weight_bias, output_en;             // coefficients loaded / result valid
    wire [8*CHANNEL_OUT-1:0] end_data;                  // DUT result, 8 bit, 1 channel

    conv_layer#(
        .CHANNEL_IN(CHANNEL_IN),
        .CHANNEL_OUT(CHANNEL_OUT)
    )uut_conv_layer(
        .clk(clk),
        .rst(rst),
        .input_weight_en(input_weight_en),
        .input_bias_en(input_bias_en),
        .weight_ab(weight_ab),
        .bias_ab(bias_ab),
        .write_done_weight_bias(write_done_weight_bias),
        .input_fmap_en(input_fmap_en),
        .fmap(fmap_ab),
        .output_en(output_en),
        .end_data(end_data)
    );

    // ------------------------------------------------------------------
    // Weight streaming
    //
    // In a real design the coefficients would arrive over an external
    // interface (UART, PCIe, optical or Ethernet link) or be stored on chip;
    // large coefficient sets usually need external memory such as DDR.
    // ------------------------------------------------------------------
    // One convolution needs 3*3*CHANNEL_IN weights per output channel.
    reg [7:0] weight[9*CHANNEL_IN*CHANNEL_OUT-1:0];

    initial begin
        // Load all weights from the hex text file into the local memory.
        $readmemh("C://Users//mayn//Desktop//nn//conv//conv_layer//weight.txt",weight);
    end

    integer weight_count;
    always @(posedge clk) begin
        if (rst) begin
            weight_count    <= 0;
            input_weight_en <= 0;
            weight_ab       <= 0;
        end
        else begin
            if (weight_count < 9*CHANNEL_IN*CHANNEL_OUT) begin
                // More weights to send: drive the next byte with enable high.
                input_weight_en <= 1;
                weight_count    <= weight_count + 1;
                weight_ab       <= weight[weight_count];
            end
            else begin
                input_weight_en <= 0;
                weight_ab       <= 0;
            end
        end
    end

    // ------------------------------------------------------------------
    // Bias streaming
    // ------------------------------------------------------------------
    reg [7:0] bias[CHANNEL_OUT-1:0];

    initial begin
        // Load all biases from the hex text file into the local memory.
        $readmemh("C://Users//mayn//Desktop//nn//conv//conv_layer//bias.txt",bias);
    end

    integer bias_count;
    always @(posedge clk) begin
        if (rst) begin
            bias_count    <= 0;
            input_bias_en <= 0;
            bias_ab       <= 0;
        end
        else begin
            if (bias_count < CHANNEL_OUT) begin
                input_bias_en <= 1;
                bias_count    <= bias_count + 1;
                bias_ab       <= bias[bias_count];
            end
            else begin
                input_bias_en <= 0;
                bias_ab       <= 0;
            end
        end
    end

    // ------------------------------------------------------------------
    // Feature-map streaming
    // ------------------------------------------------------------------
    localparam IMAGE_WIDTH = 482;
    localparam IMAGE_HIGH  = 322;
    // A 3x3 kernel without padding yields (W-2) x (H-2) outputs (480 x 320).
    localparam OUT_TOTAL   = (IMAGE_WIDTH-2)*(IMAGE_HIGH-2);

    // Whole image per colour plane, indexed as column + IMAGE_WIDTH*row.
    // In a real design these values would come from external memory.
    reg [7:0] fmap_R[IMAGE_WIDTH*IMAGE_HIGH-1:0];
    reg [7:0] fmap_G[IMAGE_WIDTH*IMAGE_HIGH-1:0];
    reg [7:0] fmap_B[IMAGE_WIDTH*IMAGE_HIGH-1:0];

    initial begin
        // Read the three colour planes from their hex text files.
        $readmemh("C://Users//mayn//Desktop//nn//conv//conv_layer//imageBlueChannels.txt",fmap_B);
        $readmemh("C://Users//mayn//Desktop//nn//conv//conv_layer//imageGreenChannels.txt",fmap_G);
        $readmemh("C://Users//mayn//Desktop//nn//conv//conv_layer//imageRedChannels.txt",fmap_R);
    end

    integer i, j;       // window column (i) and row (j) of the top-left pixel
    reg [2:0] states;   // 0: drive a window, 1: advance position, 2: finished

    always @(posedge clk) begin
        if (rst) begin
            i             <= 0;
            j             <= 0;
            states        <= 0;
            input_fmap_en <= 0;
            fmap_ab       <= 0;
        end
        else if (write_done_weight_bias) begin  // wait until coefficients are loaded
            case (states)
                0: begin
                    if (i < IMAGE_WIDTH-2) begin
                        input_fmap_en <= 1;
                        // Pack the window MSB-first as R, G, B, each plane as
                        // row j+2, row j+1, row j with columns i+2, i+1, i.
                        // The B plane therefore sits in bits [71:0], G in
                        // [143:72] and R in [215:144]; within a plane the
                        // lowest byte is pixel (row j, column i).
                        // Non-blocking assignment, like every other write to
                        // fmap_ab in this process, so the DUT never sees the
                        // vector change in the middle of a clock edge.
                        fmap_ab <= {fmap_R[i+2+IMAGE_WIDTH*(j+2)],fmap_R[i+1+IMAGE_WIDTH*(j+2)],fmap_R[i+0+IMAGE_WIDTH*(j+2)],
                                    fmap_R[i+2+IMAGE_WIDTH*(j+1)],fmap_R[i+1+IMAGE_WIDTH*(j+1)],fmap_R[i+0+IMAGE_WIDTH*(j+1)],
                                    fmap_R[i+2+IMAGE_WIDTH*(j+0)],fmap_R[i+1+IMAGE_WIDTH*(j+0)],fmap_R[i+0+IMAGE_WIDTH*(j+0)],
                                    fmap_G[i+2+IMAGE_WIDTH*(j+2)],fmap_G[i+1+IMAGE_WIDTH*(j+2)],fmap_G[i+0+IMAGE_WIDTH*(j+2)],
                                    fmap_G[i+2+IMAGE_WIDTH*(j+1)],fmap_G[i+1+IMAGE_WIDTH*(j+1)],fmap_G[i+0+IMAGE_WIDTH*(j+1)],
                                    fmap_G[i+2+IMAGE_WIDTH*(j+0)],fmap_G[i+1+IMAGE_WIDTH*(j+0)],fmap_G[i+0+IMAGE_WIDTH*(j+0)],
                                    fmap_B[i+2+IMAGE_WIDTH*(j+2)],fmap_B[i+1+IMAGE_WIDTH*(j+2)],fmap_B[i+0+IMAGE_WIDTH*(j+2)],
                                    fmap_B[i+2+IMAGE_WIDTH*(j+1)],fmap_B[i+1+IMAGE_WIDTH*(j+1)],fmap_B[i+0+IMAGE_WIDTH*(j+1)],
                                    fmap_B[i+2+IMAGE_WIDTH*(j+0)],fmap_B[i+1+IMAGE_WIDTH*(j+0)],fmap_B[i+0+IMAGE_WIDTH*(j+0)]};
                        i      <= i + 1;  // slide the window one column to the right
                        states <= 1;
                    end
                    else begin
                        input_fmap_en <= 0;
                    end
                end

                1: begin
                    input_fmap_en <= 0;
                    if (i == IMAGE_WIDTH-2) begin
                        // End of a row of windows: back to column 0, one row down.
                        i <= 0;
                        j <= j + 1;
                    end
                    if (j < IMAGE_HIGH-2) begin
                        states <= 0;
                    end
                    // Last window of the last row: this later assignment
                    // overrides the one above and stops the walk.
                    if ((j == IMAGE_HIGH-3) && (i == IMAGE_WIDTH-2)) begin
                        states <= 2;
                    end
                end

                2: begin
                    // All windows sent: keep the DUT inputs idle.
                    input_fmap_en <= 0;
                    fmap_ab       <= 0;
                end

                default: begin
                    // Unused encodings: stay idle.
                    input_fmap_en <= 0;
                end
            endcase
        end
    end

    // ------------------------------------------------------------------
    // Result capture
    // ------------------------------------------------------------------
    integer end_temp;   // file descriptor of the result file
    integer out_count;  // number of results written so far

    initial begin
        out_count = 0;
        end_temp  = $fopen("C://Users//mayn//Desktop//nn//conv//conv_layer//conv_layer_result.txt","w");
        if (end_temp == 0) begin
            // $fopen returns 0 on failure; without a valid descriptor the
            // run would silently produce no results.
            $display("tb_conv_layer: unable to open conv_layer_result.txt for writing");
            $finish;
        end
    end

    always @(posedge clk) begin
        if (output_en) begin
            // One hex line per valid DUT result.
            $fwrite(end_temp,"%h\n",$signed(end_data));
            out_count <= out_count + 1;
        end
        // Once every window has been sent and every expected result has been
        // written, close the file so its contents are flushed and end the run.
        if ((states == 2) && (out_count >= OUT_TOTAL)) begin
            $fclose(end_temp);
            $finish;
        end
    end

endmodule