CSAPP - Arch Lab

Computer Systems A Programmer's Perspective 书籍课程配套实验

实验前的准备工作

按照archlab.pdf文件中的描述，需要在archlab-handout/sim目录下执行 make clean; make 命令，但会出现工具缺少、缺少运行库的问题，所以根据提示来安装工具：

$ sudo apt-get install bison flex

由于我是连接服务器来做实验的，所以没有GUI，根据sim目录下的MakeFile提示，注释掉有关TCL、tk的运行库。

# MakeFile
# Comment this out if you don't have Tcl/Tk on your system

# GUIMODE=-DHAS_GUI

# Modify the following line so that gcc can find the libtcl.so and
# libtk.so libraries on your system. You may need to use the -L option
# to tell gcc which directory to look in. Comment this out if you
# don't have Tcl/Tk.

# TKLIBS=-L/usr/lib -ltk -ltcl

# Modify the following line so that gcc can find the tcl.h and tk.h
# header files on your system. Comment this out if you don't have
# Tcl/Tk.

# TKINC=-isystem /usr/include/tcl8.5

Part A

A部分用来熟悉Y86-64的汇编，编写汇编程序，实现example.c中函数的功能。编写好***.ys程序后，使用yas汇编成***.yo，再使用yis运行***.yo文件。还要注意的点是程序最后都要留一行空行

sum_list函数

比较简单，程序框架可以看书的P252的内容。

	.pos 0
	irmovq stack, %rsp
	call main
	halt

# Array of elements
.align 8
ele1:
	.quad 0x00a
	.quad ele2
ele2:
	.quad 0x0b0
	.quad ele3
ele3:
	.quad 0xc00
	.quad 0

main:
	irmovq ele1, %rdi  # get the begin address of array
	call sum_list
	ret

sum_list:
	irmovq $0, %rax
	irmovq $8, %r10  # bias
test:
	rrmovq %rdi, %r8
	andq %r8, %r8
	jne loop
	ret
loop:
	mrmovq (%rdi), %r9
	addq %r9, %rax
	addq %r10, %rdi
	mrmovq (%rdi), %rdi
	jmp test

# stack start here
	.pos 0x200
stack:

执行结果

sugar@ubuntuServer:~/csappLab/archlab-handout/sim/misc$ ./yas sum.ys
sugar@ubuntuServer:~/csappLab/archlab-handout/sim/misc$ ./yis sum.yo
Stopped in 36 steps at PC = 0x13.  Status 'HLT', CC Z=1 S=0 O=0
Changes to registers:
%rax:	0x0000000000000000	0x0000000000000cba
%rsp:	0x0000000000000000	0x0000000000000200
%r9:	0x0000000000000000	0x0000000000000c00
%r10:	0x0000000000000000	0x0000000000000008

Changes to memory:
0x01f0:	0x0000000000000000	0x000000000000005b
0x01f8:	0x0000000000000000	0x0000000000000013

rsum_list函数

用递归的方式实现链表求和，注意调用者保存的参数压到栈上。

	.pos 0
	irmovq stack, %rsp
	call main
	halt

# Array of elements
.align 8
ele1:
	.quad 0x00a
	.quad ele2
ele2:
	.quad 0x0b0
	.quad ele3
ele3:
	.quad 0xc00
	.quad 0

main:
	irmovq ele1, %rdi
	call rsum_list
	ret

rsum_list:
	rrmovq %rdi, %r8
	andq %r8, %r8
	jne recur
	irmovq $0, %rax
	ret
recur:
	pushq %r9
	mrmovq (%rdi), %r9
	mrmovq 8(%rdi), %rdi
	call rsum_list
	addq %r9, %rax
	popq %r9
	ret

	.pos 0x200
stack:

测试

sugar@ubuntuServer:~/csappLab/archlab-handout/sim/misc$ ./yas rsum.ys 
sugar@ubuntuServer:~/csappLab/archlab-handout/sim/misc$ ./yis rsum.yo
Stopped in 41 steps at PC = 0x13.  Status 'HLT', CC Z=0 S=0 O=0
Changes to registers:
%rax:	0x0000000000000000	0x0000000000000cba
%rsp:	0x0000000000000000	0x0000000000000200

Changes to memory:
0x01c0:	0x0000000000000000	0x0000000000000093
0x01c8:	0x0000000000000000	0x00000000000000b0
0x01d0:	0x0000000000000000	0x0000000000000093
0x01d8:	0x0000000000000000	0x000000000000000a
0x01e0:	0x0000000000000000	0x0000000000000093
0x01f0:	0x0000000000000000	0x000000000000005b
0x01f8:	0x0000000000000000	0x0000000000000013

copy函数

类似memcpy，我这边犯了两个错误：

第一个就是读取src的地址时用成了mrmovq src, %rdi，正确应该是irmovq src, %rdi；

另一个是在递增src指针（保存在%rdi中）时我用了mrmovq 8(%rdi), %rdi，但是这个指令的语义是 rdi = *(rdi + 8)，正确应该是用立即数来递增rdi，假设$8存在%r8中，则递增指针应该是 addq %r8, %rdi。

	.pos 0
	irmovq stack, %rsp
	call main
	halt

# Source block
.align 8
src:
    .quad 0x00a
    .quad 0x0b0
    .quad 0xc00
# Destination block
dest:
    .quad 0x111
    .quad 0x222
    .quad 0x333

main:
	irmovq src, %rdi  # first parameter
	irmovq dest, %rsi  # seconde parameter
	irmovq $3, %rdx    # third parameter
	call copy_block
	ret

copy_block:
	irmovq $0, %rax
	irmovq $8, %r8
	irmovq $1, %r10
test:
	andq %rdx, %rdx
	jg loop
	ret
loop:
	mrmovq (%rdi), %r9  # get src val
	# mrmovq 8(%rdi), %rdi   # src++
	addq %r8, %rdi
	rmmovq %r9, (%rsi)  # *dest = val
	# mrmovq 8(%rsi), %rsi  # dest++
	addq %r8, %rsi
	xorq %r9, %rax  # result ^= val
	subq %r10, %rdx
	jmp test

	.pos 0x200
stack:

sugar@ubuntuServer:~/csappLab/archlab-handout/sim/misc$ ./yas copy.ys
sugar@ubuntuServer:~/csappLab/archlab-handout/sim/misc$ ./yis copy.yo
Stopped in 41 steps at PC = 0x13.  Status 'HLT', CC Z=1 S=0 O=0
Changes to registers:
%rax:	0x0000000000000000	0x0000000000000cba
%rsp:	0x0000000000000000	0x0000000000000200
%rsi:	0x0000000000000000	0x0000000000000048
%rdi:	0x0000000000000000	0x0000000000000030
%r8:	0x0000000000000000	0x0000000000000008
%r9:	0x0000000000000000	0x0000000000000c00
%r10:	0x0000000000000000	0x0000000000000001

Changes to memory:
0x0030:	0x0000000000000111	0x000000000000000a
0x0038:	0x0000000000000222	0x00000000000000b0
0x0040:	0x0000000000000333	0x0000000000000c00
0x01f0:	0x0000000000000000	0x000000000000006f
0x01f8:	0x0000000000000000	0x0000000000000013

PartB

给Y86-64指令集架构增加一条iaddq的指令，指令格式可以参考书本P254，指令实现了将一个常数加到几个寄存器上。实验主要修改seq-full.hcl文件，仿照书上的表格（主要参考irmovq、OPq两条指令的每个阶段处理流程）写出iaddq的处理流程就可以了。

seq-full.hcl展示被修改部分的内容：

################ Fetch Stage     ###################################

# Determine instruction code
...
bool instr_valid = icode in 
	{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
	       IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ, IIADDQ};

# Does fetched instruction require a regid byte?
bool need_regids =
	icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ, 
		     IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ};

# Does fetched instruction require a constant word?
bool need_valC =
	icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL, IIADDQ};
################ Decode Stage    ###################################
...
## What register should be used as the B source?
word srcB = [
	icode in { IOPQ, IRMMOVQ, IMRMOVQ, IIADDQ } : rB;
	icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't need register
];

## What register should be used as the E destination?
word dstE = [
	icode in { IRRMOVQ } && Cnd : rB;
	icode in { IIRMOVQ, IOPQ, IIADDQ} : rB;
	icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't write any register
];

################ Execute Stage   ###################################

## Select input A to ALU
word aluA = [
	icode in { IRRMOVQ, IOPQ } : valA;
	icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ} : valC;
	icode in { ICALL, IPUSHQ } : -8;
	icode in { IRET, IPOPQ } : 8;
	# Other instructions don't need ALU
];

## Select input B to ALU
word aluB = [
	icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL, 
		      IPUSHQ, IRET, IPOPQ, IIADDQ} : valB;
	icode in { IRRMOVQ, IIRMOVQ } : 0;
	# Other instructions don't need ALU
];
...
## Should the condition codes be updated?
bool set_cc = icode in { IOPQ, IIADDQ };

PartC

这个部分是在流水线指令架构的基础上，对基准程序进行优化，其实对流水线的体系结构没有太多的设计（主要就是实现iaddq指令，类似partB，这里就不说了）

一些折腾：我本来是在linux服务器上跑这个实验的，但是某次我无意中在我本机（OSX）上make了以下，发现只有一个报错：/usr/bin/ld: cannot find -lfl 原来是编译生成yas时需要flex的动态链接库，但mac系统里自带了flex的可执行文件，我还没有找到对应的库，于是我又用brew另外安装了一个flex：

==> flex
flex is keg-only, which means it was not symlinked into /usr/local,
because macOS already provides this software and installing another version in
parallel can cause all kinds of trouble.

If you need to have flex first in your PATH, run:
  echo 'export PATH="/usr/local/opt/flex/bin:$PATH"' >> ~/.zshrc

For compilers to find flex you may need to set:
  export LDFLAGS="-L/usr/local/opt/flex/lib"
  export CPPFLAGS="-I/usr/local/opt/flex/include"

之后修改/sim/misc/makefile的LEXLIB为：LEXLIB = -L/usr/local/opt/flex/lib -lfl

就可以正常编译了。

如何验证PartC的答案：

1、首先是ncopy.ys的正确（driver的原理就是给数据段的src赋值，因为ncopy.ys的内容里面没有指定src的内容

unix> make drivers  
unix> ../misc/yis sdriver.yo  
unix> ../misc/yis ldriver.yo

%rax的结果是2，那么就说明你的y86汇编写的没问题

2、测试这段ys汇编在流水线指令集架构上的运行

make psim VERSION=full  
./psim -t sdriver.yo  
./psim -t ldriver.yo

由于sdriver和ldriver都是固定的元素数量,可以用perl脚本correntness.pl来生成不同元素数量的src的driver，相当于不停的调用 ./gen-driver.pl -f ncopy.ys -n K -rc > driver.ys 来测试

3、测试流水线新增指令后是否影响了原来的指令集（然后是pipe-full.hcl的流水线架构的Y86指令集的正确性）

unix> (cd ../ptest; make SIM=../pipe/psim TFLAGS=-i)

4、在新增了指令后的流水线上测试ncopy的代码

unix> ./correctness.pl -p

5、最后的评分用benchmark.pl来评定，用你的流水线架构跑ncopy.ys程序的CPE。

这边我用2 * 1展开循环，作为优化ncopy.ys

# You can modify this portion
	# Loop header
	xorq %rax,%rax		# count = 0;
	andq %rdx,%rdx		# len <= 0?
	jle Done		# if so, goto Done:
	iaddq $-1, %rdx     # limit = len - 1
	je Only
Loop:	
	mrmovq (%rdi), %r10	# read val from src...
	mrmovq 8(%rdi), %r9
	andq %r10, %r10		# val <= 0?
	jle Npos1
	iaddq $1, %rax
Npos1:
	rmmovq %r10, (%rsi)	# ...and store it to dst
	andq %r9, %r9		# val <= 0?
	jle Npos		# if so, goto Npos:
	iaddq $1, %rax		# count++
Npos:	
	rmmovq %r9, 8(%rsi)  # *(dest+1) = *(src+1)
	iaddq $-2, %rdx    # limit -= 2
	iaddq $16, %rdi		# src += 2
	iaddq $16, %rsi		# dst += 2
	andq %rdx,%rdx		# limit > 0?
	jg Loop			# if so, goto Loop:
	jl Done
Only:
	mrmovq (%rdi), %r10  # remain one
	andq %r10, %r10
	jle Npos2
	iaddq $1, %rax
Npos2:
	rmmovq %r10, (%rsi)
Done:
	ret

用driver测试这段代码的正确性：

➜  pipe git:(master) ✗ make drivers
./gen-driver.pl -n 4 -f ncopy.ys > sdriver.ys
../misc/yas sdriver.ys
./gen-driver.pl -n 63 -f ncopy.ys > ldriver.ys
../misc/yas ldriver.ys
➜  pipe git:(master) ✗ ../misc/yis sdriver.yo
Stopped in 40 steps at PC = 0x31.  Status 'HLT', CC Z=0 S=1 O=0
Changes to registers:
%rax:	0x0000000000000000	0x0000000000000002
%rdx:	0x0000000000000000	0xffffffffffffffff
%rsp:	0x0000000000000000	0x00000000000001d0
%rsi:	0x0000000000000000	0x0000000000000148
%rdi:	0x0000000000000000	0x0000000000000118
%r9:	0x0000000000000000	0x0000000000000004
%r10:	0x0000000000000000	0x0000000000000003

Changes to memory:
0x0128:	0x0000000000cdefab	0xffffffffffffffff
0x0130:	0x0000000000cdefab	0xfffffffffffffffe
0x0138:	0x0000000000cdefab	0x0000000000000003
0x0140:	0x0000000000cdefab	0x0000000000000004
0x01c8:	0x0000000000000000	0x0000000000000031
➜  pipe git:(master) ✗ ../misc/yis ldriver.yo   
Stopped in 450 steps at PC = 0x31.  Status 'HLT', CC Z=0 S=0 O=0
Changes to registers:
%rax:	0x0000000000000000	0x000000000000001f
%rsp:	0x0000000000000000	0x0000000000000588
%rsi:	0x0000000000000000	0x00000000000004f8
%rdi:	0x0000000000000000	0x00000000000002e8
%r9:	0x0000000000000000	0x000000000000003e
%r10:	0x0000000000000000	0x000000000000003f

Changes to memory:
0x0308:	0x0000000000cdefab	0xffffffffffffffff
0x0310:	0x0000000000cdefab	0x0000000000000002
...
➜  pipe git:(master) ✗ ./correctness.pl 
Simulating with instruction set simulator yis
	ncopy
0	OK
1	OK
2	OK
3	OK
...
256	OK
68/68 pass correctness test

在pipe-full.hcl中实现iaddq指令后，测试实现的正确性

$ make psim VERSION=full  
$ ./psim -t sdriver.yo 
$ ./psim -t ldriver.yo
$ (cd ../y86-code; make testpsim) 
$ (cd ../ptest; make SIM=../pipe/psim TFLAGS=-i)
$ ./correctness.pl -p     
都通过后，使用benchmark.pl进行测试
得分20/60
Average CPE	9.48
Score	20.4/60.0

得分20😅，让我去网上看看大佬们都怎么搞的

Sugar's Blog