CSAPP - Cache Lab
Computer Systems A Programmer's Perspective 书籍课程配套实验
PartA
实现一个cache的模拟程序。(实验环境OSX)
通过读取traces/文件夹下的某个内存访问记录,模拟cache的miss、hit、eviction。
内存访问有四种:
I <address>, size
=> 加载指令M <address>, size
=> 修改内存中的值,其效果相当于L指令后跟一个S指令L <address>, size
=> 读取内存S <address>, size
=> 写内存
我们实现的cache模拟器要支持如下的命令行参数:
- -h : 显示帮助
- -v :以冗余格式显示内存访问的情况
- -s : Sn = 2s 指定了cache中的组数为Sn
- -E : 相联度,每组多少行
- -b : cache块大小 B = 2b
-t <tracefile>
: 指定cache读取的内存访问序列所在文件
可以先使用csim-refs来看看预期的程序运行结果。在csim.c文件中完成这个实验。
Cache数据结构的定义
struct cacheLine
{
__uint64_t tag;
bool valid;
short* blocks;
int age; // 记录被访问时的年龄,年龄值越小,越老
};
typedef struct set {
struct cacheLine* lines;
int ageCount; // cache组的访问年龄,每次访问都增加
}Set;
typedef struct cache {
Set* sets;
int setNum;
int blockSize;
int lineNum;
}Cache;
LRU算法的实现,参考了Wikipedia上的一个解释。如下是一个cache组,每组4行,访问序列是ABCDEDF
由于cache是线性表的结构,无法使用类似FILO的方式实现LRU。我这边使用了一个年龄计数法,每次访问一个cache行,该行的年龄就会增加(设置为当前的cache组年龄,随访问次数增加而增加),cache组中年龄值最小的就是应该被淘汰的(eviction)
解析命令行参数
#define true 1
#define false 0
#define DEBUG 0
#define Dprintf(fmt, ...) if (DEBUG) printf(fmt, __VA_ARGS__)
typedef int bool;
typedef struct argsinfo {
bool verbose; // 冗长输出
int setBitNum; // 组位数
int cacheLineNum; // 每组行数
int blockBitNum; // 块位数
char* file; // .trace文件
}ArgsInfo;
bool parseArgs(ArgsInfo* argsInfo, int argc, char** argv) {
if (argc != 9 && argc != 10) {
printf("Usage: %s [-v] -s -E -b -t \n", argv[0]);
exit(0);
}
int i;
for (i = 1; i < argc; i++) {
if (!strcmp(argv[i], "-v")) {
argsInfo->verbose = true;
} else if (!strcmp(argv[i], "-s")) {
i++;
argsInfo->setBitNum = atoi(argv[i]);
} else if (!strcmp(argv[i], "-E")) {
i++;
argsInfo->cacheLineNum = atoi(argv[i]);
} else if (!strcmp(argv[i], "-b")) {
i++;
argsInfo->blockBitNum = atoi(argv[i]);
} else if (!strcmp(argv[i], "-t")) {
i++;
argsInfo->file = (char *)malloc(strlen(argv[i]) + 1);
// printf("sizeof *argv[i] = %lu\n", sizeof(*argv[i])); // 1 for *argv[i], 8 for argv[i]
// printf("strlen argv = %lu\n", strlen(argv[i]));
strcpy(argsInfo->file, argv[i]);
// printf("strlen file = %lu\n", strlen(argsInfo->file)); // same as argv[i]
} else {
printf("Wrong Parameter!\n");
exit(-1);
}
}
Dprintf("Verbose: %d, setBitNum = %d, cacheLineNum = %d, blockBitNum = %d, file = %s\n",
argsInfo->verbose, argsInfo->setBitNum, argsInfo->cacheLineNum, argsInfo->blockBitNum, argsInfo->file);
return true;
}
cache结构的初始化和释放
bool initCache(Cache* cache, ArgsInfo* argsInfo){
cache->setNum = pow(2, argsInfo->setBitNum);
cache->lineNum = argsInfo->cacheLineNum;
cache->blockSize = pow(2, argsInfo->blockBitNum);
cache->sets = (Set *)malloc(sizeof (Set) * cache->setNum);
for (int i = 0;i < cache->setNum; i++) {
// 给每一个组分配cache行
cache->sets[i].lines = (struct cacheLine *)malloc(argsInfo->cacheLineNum * sizeof(struct cacheLine));
cache->sets[i].ageCount = 0;
for (int j = 0; j < argsInfo->cacheLineNum; j++) {
cache->sets[i].lines[j].blocks = NULL; // We don't really store the data. (short *)malloc(cache->blockSize * sizeof (short));
cache->sets[i].lines[j].tag = 0;
cache->sets[i].lines[j].valid = false;
cache->sets[i].lines[j].age = 0;
// memset(cache->sets[i].lines[j].blocks, 0, sizeof(short) * cache->blockSize);
}
}
Dprintf("Init cache success.\nsetNum = %d, lineNum = %d, blockSize = %d\n",
cache->setNum, cache->lineNum, cache->blockSize);
return true;
}
void freeCache(Cache* cache){
for (int i = 0; i < cache->setNum; ++i) {
// for (int j = 0; j < cache->lineNum; ++j) {
// free(cache->sets[i].lines[j].blocks);
// }
free(cache->sets[i].lines);
}
free(cache->sets);
}
在cache中查找数据、空行、寻找需要替换的行
struct cacheLine* findData(Cache* cache, __uint64_t tag, __uint64_t setIndex){
struct cacheLine* lines = cache->sets[setIndex].lines;
for (int i = 0; i < cache->lineNum; ++i) {
if (lines[i].valid && lines[i].tag == tag) {
return lines+i;
}
}
return NULL;
}
struct cacheLine* findEmptyLine(Cache* cache, __uint64_t setIndex) {
struct cacheLine* lines = cache->sets[setIndex].lines;
for (int i = 0; i < cache->lineNum; ++i) {
if (lines[i].valid == false) {
return lines+i;
}
}
return NULL;
}
struct cacheLine* findEvictionLine(Cache* cache, __uint64_t setIndex) { // LRU
// all the cache line is valid, find the line that has the lowest age value
struct cacheLine* lines = cache->sets[setIndex].lines;
int minIndex = 0;
int minAge = lines[0].age;
for (int i = 1; i < cache->lineNum; ++i) {
if (lines[i].age < minAge){
minAge = lines[i].age;
minIndex = i;
}
}
return lines+minIndex;
}
主函数
int main(int argc, char** argv)
{
// parse args
ArgsInfo argsInfo;
parseArgs(&argsInfo, argc, argv);
Cache cache;
initCache(&cache, &argsInfo);
FILE* fp = fopen(argsInfo.file, "r");
if(!fp) {
perror("File opening failed");
return EXIT_FAILURE;
}
char *buf = NULL;
size_t len;
AccessInfo accessInfo;
// Dprintf("sizeof unsigned = %lu\n", sizeof(unsigned )); // 4
// unsigned a = 0x80000000;
// Dprintf("a >> 31 = 0x%x\n", a >> 31); // 0x1
// int a = 0x80000000;
// Dprintf("a >> 31 = 0x%x\n", a >> 31); // 0xffffffff
__int64_t min64 = 1; // 有符号数
__uint64_t temp = (min64 << 63) >> (argsInfo.setBitNum - 1); // 算数右移
Dprintf("temp = 0x%llx\n", temp);
__uint64_t mask = ((__uint64_t)temp) >> (64 - argsInfo.setBitNum); // setBitNum 位数的掩码,逻辑右移
Dprintf("mask = 0x%llx\n", mask);
int hits = 0, misses = 0, evictions = 0;
while (getline(&buf, &len, fp) != -1) {
if (parseAccessInfo(buf, &accessInfo)) {
__uint64_t tagNSet = accessInfo.address >> argsInfo.blockBitNum;
__uint64_t tag = (tagNSet & ~(mask)) >> argsInfo.setBitNum;
__uint64_t setIndex = tagNSet & mask;
Dprintf("tag = 0x%llx, set = 0x%llx\n", tag, setIndex);
if (argsInfo.verbose) {
printf("%c %llx,%d ", accessInfo.op, accessInfo.address, accessInfo.size);
}
switch (accessInfo.op) {
case 'S':
case 'L': { // remember to use brackets to include the whole case
struct cacheLine *line = NULL;
if ((line = findData(&cache, tag, setIndex))) { // remember to use little brackets to include the assignment
hits++;
line->age = cache.sets[setIndex].ageCount;
if (argsInfo.verbose) printf("hit\n");
} else {
misses++;
if (argsInfo.verbose) printf("miss");
struct cacheLine *modifyLine = NULL;
if ((modifyLine = findEmptyLine(&cache, setIndex))) {
modifyLine->valid = true;
modifyLine->tag = tag;
modifyLine->age = cache.sets[setIndex].ageCount;
if (argsInfo.verbose) printf("\n");
} else {
evictions++;
if (argsInfo.verbose) printf(" eviction\n");
struct cacheLine *evictedLine = findEvictionLine(&cache, setIndex);
Dprintf("Evict: set=%llu, tag=%llx\n", setIndex, evictedLine->tag);
evictedLine->valid = true;
evictedLine->tag = tag;
evictedLine->age = cache.sets[setIndex].ageCount;
}
}
cache.sets[setIndex].ageCount++;
break;
}
case 'M': {
struct cacheLine *line = NULL;
if ((line = findData(&cache, tag, setIndex))) {
hits++;
if (argsInfo.verbose) printf("hit ");
line->age = cache.sets[setIndex].ageCount;
line->valid = true;
hits++;
if (argsInfo.verbose) printf("hit\n"); // hit by store
} else {
misses++;
if (argsInfo.verbose) printf("miss ");
struct cacheLine* modifyLine = NULL;
if ((modifyLine = findEmptyLine(&cache, setIndex))) {
modifyLine->valid = true;
modifyLine->age = cache.sets[setIndex].ageCount;
modifyLine->tag = tag;
hits++;
if (argsInfo.verbose) printf("hit\n");
} else {
evictions++;
if (argsInfo.verbose) printf("eviction ");
struct cacheLine* evictedLine = findEvictionLine(&cache, setIndex);
Dprintf("Evict: set=%llu, tag=%llx\n", setIndex, evictedLine->tag);
evictedLine->valid = true;
evictedLine->tag = tag;
evictedLine->age = cache.sets[setIndex].ageCount;
hits++;
if (argsInfo.verbose) printf("hit\n");
}
}
cache.sets[setIndex].ageCount++;
break;
}
default:
exit(-1);
}
}
}
freeCache(&cache);
printSummary(hits, misses, evictions);
return 0;
}
bool parseAccessInfo(char* buf, AccessInfo* accessInfo){
if (buf[0] == 'I') { // ignore the instruction access
return false;
}
accessInfo->op = buf[1];
accessInfo->address = strtol(&buf[3], NULL, 16);
char *ptr = strtok(buf, ",");
// Dprintf("%s", ptr);
ptr = strtok(NULL, " ");
accessInfo->size = (int)strtol(ptr, NULL, 10);
Dprintf("\nAccessInfo: op = %c, address = 0x%llx, size = %d\n",
accessInfo->op, accessInfo->address, accessInfo->size);
return true;
}
为了使用test-csim,发现linux不支持getline函数,所以使用fgets替代,目的都是读取一行输入;同时linux也不支持__VA_ARGS__的宏。做了这些改动之后在linux服务器上运行得到正确结果:
sugar@ubuntuServer:~/csappLab/cachelab-handout$ make
gcc -g -Wall -Werror -std=c99 -m64 -o csim csim.c cachelab.c -lm
gcc -g -Wall -Werror -std=c99 -m64 -O0 -c trans.c
gcc -g -Wall -Werror -std=c99 -m64 -o test-trans test-trans.c cachelab.c trans.o
gcc -g -Wall -Werror -std=c99 -m64 -O0 -o tracegen tracegen.c trans.o cachelab.c
# Generate a handin tar file each time you compile
tar -cvf sugar-handin.tar csim.c trans.c
csim.c
trans.c
sugar@ubuntuServer:~/csappLab/cachelab-handout$ ./test-csim
Your simulator Reference simulator
Points (s,E,b) Hits Misses Evicts Hits Misses Evicts
3 (1,1,1) 9 8 6 9 8 6 traces/yi2.trace
3 (4,2,4) 4 5 2 4 5 2 traces/yi.trace
3 (2,1,4) 2 3 1 2 3 1 traces/dave.trace
3 (2,1,3) 167 71 67 167 71 67 traces/trans.trace
3 (2,2,3) 201 37 29 201 37 29 traces/trans.trace
3 (2,4,3) 212 26 10 212 26 10 traces/trans.trace
3 (5,1,5) 231 7 0 231 7 0 traces/trans.trace
6 (5,1,5) 265189 21775 21743 265189 21775 21743 traces/long.trace
27
TEST_CSIM_RESULTS=27
Part B
实验准备:需要程序运行时内存访问序列的获取,linux可能需要安装valgrind
编写矩阵转置算法,使得该算法在一个直接映射的cache上有较高的命中率。cache的参数为:32组、每组1行、块大小为32bytes。
一开始只是使用了分块的思想,每次处理一个8 * 8大小的int类型矩阵,选择8是因为cache每行只能放下8个int元素。
char transpose_submit_desc[] = "Transpose submission";
void transpose_submit(int M, int N, int A[N][M], int B[M][N]){
// cache 共32组,每组1行,每行32byte(8个int)
int tmp;
int bsize = 8; // 小块矩阵为 8 * 8
for (int i = 0; i < N; i += bsize) {
for (int j = 0; j < M; j += bsize) {
for (int bi = i; bi < (i + bsize < N ? i + bsize : N); ++bi) {
for (int bj = j; bj < (j + bsize < M ? j + bsize : M); ++bj) {
B[bj][bi] = A[bi][bj];
}
}
}
}
}
但是运行32*32和61*67都没有到达满分:
对于32 * 32的矩阵
sugar@ubuntuServer:~/csappLab/cachelab-handout$ ./test-trans -M 32 -N 32
Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:1710, misses:343, evictions:311
sugar@ubuntuServer:~/csappLab/cachelab-handout$ ./test-trans -M 61 -N 67
Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:6061, misses:2118, evictions:2086
后来仔细观察矩阵转置的内存地址访问序列发现,访问地址的后12位指定了cache的访问组号和标记等信息,而后12位的高9位指定了访问的行和列。
S 0034a65c,4
L 0030ac30,4
S 0034a6dc,4
L 0030ac34,4
S 0034a75c,4
L 0030ac38,4
S 0034a7dc,4
L 0030ac3c,4
S 0034a85c,4
L 0030a8c0,4
S 0034a8c0,4
L 0030a8c4,4
S 0034a940,4
L 0030a8c8,4
S 0034a9c0,4
L 0030a8cc,4
S 0034aa40,4
后12位指定了访问的行列信息:
地址位数
----->12 11<------------>2 1<->0
数组基址 row * 32 + col in
所以访问两个数组同行同列的元素一定会导致cache的miss和eviction,所以对对角线元素特殊处理
char transpose_submit_desc[] = "Transpose submission";
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
// cache 共32组,每组1行,每行32byte(8个int)
int tmp;
int bsize = 8; // 小块矩阵为 8 * 8
for (int i = 0; i < N; i += bsize) {
for (int j = 0; j < M; j += bsize) {
for (int bi = i; bi < (i + bsize < N ? i + bsize : N); ++bi) {
int index = 0;
for (int bj = j; bj < (j + bsize < M ? j + bsize : M); ++bj) {
if (bi != bj) {
B[bj][bi] = A[bi][bj];
} else {
tmp = A[bi][bj];
index = bi;
}
}
// 完成A的一行元素转置后,再把对角线上的元素赋给B,否则先访问位于同行列的B后,把A的行给驱逐
if (i == j) {
B[index][index] = tmp;
}
}
}
}
}
可以对32*32的运算miss降低到300以下:
sugar@ubuntuServer:~/csappLab/cachelab-handout$ ./test-trans -M 32 -N 32
Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:1766, misses:287, evictions:255
ddfd对于M=61、N=67的矩阵,要求比较宽松,使用16 * 16的小矩阵即可:
if (M == 61 && N == 67) {
int i, j, k, l;
for (i = 0; i < N; i+=16)
{
for (j = 0; j < M; j+=16)
{
for (k = i; k < i + 16 && k < N; k++)
{
for (l = j; l < j + 16 && l < M; l++)
{
B[l][k] = A[k][l];
}
}
}
}
}
对于64 * 64的矩阵,由于矩阵的一行有64个int,如果还是使用8*8的小矩阵,那么小矩阵的第0行和第4行会存在同一个cache组内(根据地址判断),导致冲突。使用4 * 4的小矩阵作为替代(参考)
if (M == 64 && N == 64) {
int i, j, k;
int v0, v1, v2, v3;
for (i = 0; i < N; i+=4)
{
for (j = 0; j < M; j+=4)
{
for (k = i; k < i + 4; k++)
{
v0 = A[k][j];
v1 = A[k][j+1];
v2 = A[k][j+2];
v3 = A[k][j+3];
B[j][k] = v0;
B[j+1][k] = v1;
B[j+2][k] = v2;
B[j+3][k] = v3;
}
}
}
}