基于华莱士树+booth2位编码+32压缩器实现的乘法器(Verilog实现)
3-2压缩器实际上就是一个保存进位加法器(csa)。他的功能就是将3个操作数转换为2个操作数(不会溢出,最多1+1+1=10,2位足够了)。原理部分只做简要说明,想要深入理解的话可自行搜索资料或查看文章末尾的参考资料。华莱士树实际上就是将多个32压缩器拼接起来。8种booth_code对应的部分积操作。代码部分有详细注释,请结合注释理解代码。[4]CPU设计实战(汪文祥)第五章。压缩器内的逻辑运算
·
这里写自定义目录标题
基于booth2位编码+32压缩器+华莱士树实现的乘法器(Verilog实现)
原理部分
原理部分只做简要说明,想要深入理解的话可自行搜索资料或查看文章末尾的参考资料
booth2位编码
8种booth_code对应的部分积操作
32压缩器
3-2压缩器实际上就是一个保存进位加法器(csa)。他的功能就是将3个操作数转换为2个操作数(不会溢出,最多1+1+1=11
,2位足够了)。
压缩器内的逻辑运算如下
module adder (
input Add_A,
input Add_B,
input Add_Cin,
output Cout,
output Sum
);
assign Cout = (Add_A & Add_B) | (Add_Cin & (Add_A | Add_B));
assign Sum = Add_A ^ Add_B ^ Add_Cin;
// assign {Cout, Sum} = Add_A + Add_B + Add_Cin;
endmodule
wallace树
华莱士树实际上就是将多个32压缩器拼接起来。
本文使用的是17个3_2压缩器拼接生成一个1位华莱士树,华莱士树的接线图如下:
代码实现
代码部分有详细注释,请结合注释理解代码。
Wallace
module wallace (
input [31:0] mul1,
input [31:0] mul2,
output [63:0] result
);
// every 3 bits generate a boothcode, then shift 2 bits
wire [ 2:0] booth_code [15:0];
// mul1 2*mul1 mul1_complement 2*mul1_complement
// mul1 is the multiplicand
// mul2 is the multiplier
wire [31:0] mulX;
wire [32:0] mulX_2;
wire [31:0] mulX_com;
wire [32:0] mulX_com_2;
wire [31:0] mulY;
// Nsum is the partical product
// Csum is the carry of 3_2_compressor
// Ssum is the sum of 3_2_compressor
wire [63:0] Nsum [15:0];
wire [64:0] Csum [16:0];
wire [63:0] Ssum [16:0];
assign mulX = mul1;
assign mulX_2 = mulX<<1;
// assign mulX_2 = {mulX[30:0], 2'b00};
assign mulX_com = ~mulX;
assign mulX_com_2 = ~ (mulX<<1);
// assign mulX_com_2 = ~ ({mulX[30:0], 2'b00});
assign mulY = mul2;
// get 3bits booth_code, every 2 bits get a boothcode, so the total number is 32/2=16
genvar i;
generate
assign booth_code[0] = {mulY[1], mulY[0], 1'b0};
for (i=1; i<=15; i=i+1) begin
assign booth_code[i] = {mulY[2*i+1], mulY[2*i], mulY[2*i-1]};
end
endgenerate
// get partial product and carry
// carry = 1, means the mulX(multiplicand) is reversed
// when booth_code == 100/101/110, fill in 1 in the end of Csum
// (because previous step just reverse, didn't add 1, so use Csum to add 1)
generate
for (i = 0; i < 16; i = i + 1) begin
assign Nsum[i] = {64{(booth_code[i] == 3'b000)}} & 64'b0 |
{64{(booth_code[i] == 3'b001)}} & {{(32-2*i){mulX[31]}}, mulX, {2*i{1'b0}}} |
{64{(booth_code[i] == 3'b010)}} & {{(32-2*i){mulX[31]}}, mulX, {2*i{1'b0}}} |
{64{(booth_code[i] == 3'b011)}} & {{(32-2*i-1){mulX_2[32]}}, mulX_2, {2*i{1'b0}}} |
{64{(booth_code[i] == 3'b100)}} & {{(32-2*i-1){mulX_com_2[32]}}, mulX_com_2, {2*i{1'b1}}} |
{64{(booth_code[i] == 3'b101)}} & {{(32-2*i){mulX_com[31]}}, mulX_com, {2*i{1'b1}}} |
{64{(booth_code[i] == 3'b110)}} & {{(32-2*i){mulX_com[31]}}, mulX_com, {2*i{1'b1}}} |
{64{(booth_code[i] == 3'b111)}} & 64'b0;
// use Csum to the implement of +1, when taking complement
assign Csum[i][0] = (booth_code[i] == 3'b100) |
(booth_code[i] == 3'b101) |
(booth_code[i] == 3'b110) ;
end
endgenerate
// initial it as zero
assign Csum[16][0] = 1'b0;
// if use 42compressor, when 1+1+1+1, it will be 100,then the csum don't have enough bits to accommodata the carry!!!
// so, if we use 42compressor, we need to new a new carry2???? to accommodata anothor carry(Cout_tmp and Cout)
// now we use 3_2_compressor
generate
for(i=0;i<64;i=i+1)begin
adder adder0(.Add_A(Nsum[ 0][i]),.Add_B(Nsum[ 1][i]),.Add_Cin(Nsum[ 2][i]),.Sum(Ssum[ 0][i]),.Cout(Csum[ 0][i+1]));
adder adder1(.Add_A(Nsum[ 3][i]),.Add_B(Nsum[ 4][i]),.Add_Cin(Nsum[ 5][i]),.Sum(Ssum[ 1][i]),.Cout(Csum[ 1][i+1]));
adder adder2(.Add_A(Nsum[ 6][i]),.Add_B(Nsum[ 7][i]),.Add_Cin(Nsum[ 8][i]),.Sum(Ssum[ 2][i]),.Cout(Csum[ 2][i+1]));
adder adder3(.Add_A(Nsum[ 9][i]),.Add_B(Nsum[ 10][i]),.Add_Cin(Nsum[ 11][i]),.Sum(Ssum[ 3][i]),.Cout(Csum[ 3][i+1]));
adder adder4(.Add_A(Nsum[ 12][i]),.Add_B(Nsum[ 13][i]),.Add_Cin(Nsum[ 14][i]),.Sum(Ssum[ 4][i]),.Cout(Csum[ 4][i+1]));
adder adder5(.Add_A(Nsum[ 15][i]),.Add_B(1'b0 ),.Add_Cin(1'b0 ),.Sum(Ssum[ 5][i]),.Cout(Csum[ 5][i+1]));
adder adder6(.Add_A(Ssum[ 0][i]),.Add_B(Ssum[ 1][i]),.Add_Cin(Ssum[ 2][i]),.Sum(Ssum[ 6][i]),.Cout(Csum[ 6][i+1]));
adder adder7(.Add_A(Ssum[ 3][i]),.Add_B(Ssum[ 4][i]),.Add_Cin(Ssum[ 5][i]),.Sum(Ssum[ 7][i]),.Cout(Csum[ 7][i+1]));
adder adder8(.Add_A(Csum[ 0][i]),.Add_B(Csum[ 1][i]),.Add_Cin(Csum[ 2][i]),.Sum(Ssum[ 8][i]),.Cout(Csum[ 8][i+1]));
adder adder9(.Add_A(Csum[ 3][i]),.Add_B(Csum[ 4][i]),.Add_Cin(Csum[ 5][i]),.Sum(Ssum[ 9][i]),.Cout(Csum[ 9][i+1]));
adder adder10(.Add_A(Ssum[ 6][i]),.Add_B(Ssum[ 7][i]),.Add_Cin(Ssum[ 8][i]),.Sum(Ssum[10][i]),.Cout(Csum[10][i+1]));
adder adder11(.Add_A(Ssum[ 9][i]),.Add_B(Csum[ 6][i]),.Add_Cin(Csum[ 7][i]),.Sum(Ssum[11][i]),.Cout(Csum[11][i+1]));
adder adder12(.Add_A(Csum[ 8][i]),.Add_B(Csum[ 9][i]),.Add_Cin(1'b0 ),.Sum(Ssum[12][i]),.Cout(Csum[12][i+1]));
adder adder13(.Add_A(Ssum[ 10][i]),.Add_B(Ssum[ 11][i]),.Add_Cin(Ssum[ 12][i]),.Sum(Ssum[13][i]),.Cout(Csum[13][i+1]));
adder adder14(.Add_A(Csum[ 10][i]),.Add_B(Csum[ 11][i]),.Add_Cin(Csum[ 12][i]),.Sum(Ssum[14][i]),.Cout(Csum[14][i+1]));
adder adder15(.Add_A(Ssum[ 13][i]),.Add_B(Ssum[ 14][i]),.Add_Cin(Csum[ 13][i]),.Sum(Ssum[15][i]),.Cout(Csum[15][i+1]));
adder adder16(.Add_A(Ssum[ 15][i]),.Add_B(Csum[ 14][i]),.Add_Cin(Csum[ 15][i]),.Sum(Ssum[16][i]),.Cout(Csum[16][i+1]));
end
endgenerate
// get result
assign result = (Ssum[16] + Csum[16][63:0]);
// judge the result is right or wrong
wire [63:0] cmp;
wire [63:0] tmp;
assign tmp = {{32{mulX[31]}},mulX} * {{32{mulY[31]}},mulY};
assign cmp = (result == tmp);
endmodule
32compressor
module adder (
input Add_A,
input Add_B,
input Add_Cin,
output Cout,
output Sum
);
assign Cout = (Add_A & Add_B) | (Add_Cin & (Add_A | Add_B));
assign Sum = Add_A ^ Add_B ^ Add_Cin;
// assign {Cout, Sum} = Add_A + Add_B + Add_Cin;
endmodule
Testbench
`timescale 1ns / 1ps
module mul_tb();
reg [31: 0] mul1;
reg [31: 0] mul2;
wire [63: 0] ans;
wire cmp2;
reg [31: 0] mul11;
reg [31: 0] mul22;
wallace mul_wallace(
.mul1(mul1),
.mul2(mul2),
.result(ans)
);
initial begin
mul1=4'h2;
mul2=4'h3;
#1000 $finish;
end
always begin
#5;
mul1=$random();
mul2=$random();
end
参考资料
[1]压缩器原理: 讲解基于verilog的4-2压缩器和3-2压缩器的实现方式,实现华莱士树(Wallace Tree
[2]Booth2位编码链接: 定点乘法器----基4booth算法
[3]Wallace树接线: verilog语言设计的32位输入使用Booth两位一乘和华莱士树的定点补码乘法器(附参考仿真文件
[4]CPU设计实战(汪文祥)第五章
更多推荐
所有评论(0)