CSAPP - Cache Lab

Computer Systems A Programmer's Perspective 书籍课程配套实验

PartA

实现一个cache的模拟程序。(实验环境OSX)

通过读取traces/文件夹下的某个内存访问记录,模拟cache的miss、hit、eviction。

内存访问有四种:

  1. I <address>, size    => 加载指令
  2. M <address>, size    => 修改内存中的值,其效果相当于L指令后跟一个S指令
  3. L <address>, size    => 读取内存
  4. S <address>, size    => 写内存

我们实现的cache模拟器要支持如下的命令行参数:

  • -h : 显示帮助
  • -v :以冗余格式显示内存访问的情况
  • -s : Sn = 2s 指定了cache中的组数为Sn
  • -E : 相联度,每组多少行
  • -b : cache块大小 B = 2b
  • -t <tracefile> : 指定cache读取的内存访问序列所在文件

可以先使用csim-refs来看看预期的程序运行结果。在csim.c文件中完成这个实验。

Cache数据结构的定义

struct cacheLine
{
    __uint64_t tag;
	bool valid;
	short* blocks;
	int age;  // 记录被访问时的年龄,年龄值越小,越老
};

typedef struct set {
	struct cacheLine* lines;
	int ageCount; // cache组的访问年龄,每次访问都增加
}Set;

typedef struct cache {
    Set* sets;
    int setNum;
    int blockSize;
    int lineNum;
}Cache;

LRU算法的实现,参考了Wikipedia上的一个解释。如下是一个cache组,每组4行,访问序列是ABCDEDF

由于cache是线性表的结构,无法使用类似FILO的方式实现LRU。我这边使用了一个年龄计数法,每次访问一个cache行,该行的年龄就会增加(设置为当前的cache组年龄,随访问次数增加而增加),cache组中年龄值最小的就是应该被淘汰的(eviction)

解析命令行参数

#define true 1
#define false 0

#define DEBUG 0
#define Dprintf(fmt, ...) if (DEBUG) printf(fmt, __VA_ARGS__)

typedef int bool;

typedef struct argsinfo {
	bool verbose;		// 冗长输出
	int setBitNum;			// 组位数
	int cacheLineNum;	// 每组行数
	int blockBitNum;		// 块位数
	char* file;			// .trace文件
}ArgsInfo;

bool parseArgs(ArgsInfo* argsInfo, int argc, char** argv) {
	if (argc != 9 && argc != 10) {
		printf("Usage: %s [-v] -s  -E  -b  -t \n", argv[0]);
		exit(0);
	}
	int i;
	for (i = 1; i < argc; i++) {
		if (!strcmp(argv[i], "-v")) {
			argsInfo->verbose = true;
		} else if (!strcmp(argv[i], "-s")) {
			i++;
			argsInfo->setBitNum = atoi(argv[i]);
		} else if (!strcmp(argv[i], "-E")) {
			i++;
			argsInfo->cacheLineNum = atoi(argv[i]);
		} else if (!strcmp(argv[i], "-b")) {
			i++;
			argsInfo->blockBitNum = atoi(argv[i]);
		} else if (!strcmp(argv[i], "-t")) {
			i++;
			argsInfo->file = (char *)malloc(strlen(argv[i]) + 1);
			// printf("sizeof *argv[i] = %lu\n", sizeof(*argv[i])); // 1 for *argv[i], 8 for argv[i]
			// printf("strlen argv = %lu\n", strlen(argv[i]));
			strcpy(argsInfo->file, argv[i]);
			// printf("strlen file = %lu\n", strlen(argsInfo->file)); // same as argv[i]
		} else {
			printf("Wrong Parameter!\n");
			exit(-1);
		}
	}
	Dprintf("Verbose: %d, setBitNum = %d, cacheLineNum = %d, blockBitNum = %d, file = %s\n",
		argsInfo->verbose, argsInfo->setBitNum, argsInfo->cacheLineNum, argsInfo->blockBitNum, argsInfo->file);
	return true;
}

cache结构的初始化和释放

bool initCache(Cache* cache, ArgsInfo* argsInfo){
    cache->setNum = pow(2, argsInfo->setBitNum);
    cache->lineNum = argsInfo->cacheLineNum;
    cache->blockSize = pow(2, argsInfo->blockBitNum);
    cache->sets = (Set *)malloc(sizeof (Set) * cache->setNum);

	for (int i = 0;i < cache->setNum; i++) {
	    // 给每一个组分配cache行
        cache->sets[i].lines = (struct cacheLine *)malloc(argsInfo->cacheLineNum * sizeof(struct cacheLine));
        cache->sets[i].ageCount = 0;
		for (int j = 0; j < argsInfo->cacheLineNum; j++) {
            cache->sets[i].lines[j].blocks = NULL; // We don't really store the data. (short *)malloc(cache->blockSize * sizeof (short));
            cache->sets[i].lines[j].tag = 0;
            cache->sets[i].lines[j].valid = false;
            cache->sets[i].lines[j].age = 0;
//            memset(cache->sets[i].lines[j].blocks, 0, sizeof(short) * cache->blockSize);
		}
	}
	Dprintf("Init cache success.\nsetNum = %d, lineNum = %d, blockSize = %d\n",
        cache->setNum, cache->lineNum, cache->blockSize);
	return true;
}

void freeCache(Cache* cache){
    for (int i = 0; i < cache->setNum; ++i) {
//        for (int j = 0; j < cache->lineNum; ++j) {
//            free(cache->sets[i].lines[j].blocks);
//        }
        free(cache->sets[i].lines);
    }
    free(cache->sets);
}

在cache中查找数据、空行、寻找需要替换的行

struct cacheLine* findData(Cache* cache, __uint64_t tag, __uint64_t setIndex){
    struct cacheLine* lines = cache->sets[setIndex].lines;
    for (int i = 0; i < cache->lineNum; ++i) {
        if (lines[i].valid && lines[i].tag == tag) {
            return lines+i;
        }
    }
    return NULL;
}

struct cacheLine* findEmptyLine(Cache* cache, __uint64_t setIndex) {
    struct cacheLine* lines = cache->sets[setIndex].lines;
    for (int i = 0; i < cache->lineNum; ++i) {
        if (lines[i].valid == false) {
            return lines+i;
        }
    }
    return NULL;
}

struct cacheLine* findEvictionLine(Cache* cache, __uint64_t setIndex) {  // LRU
    // all the cache line is valid, find the line that has the lowest age value
    struct cacheLine* lines = cache->sets[setIndex].lines;
    int minIndex = 0;
    int minAge = lines[0].age;
    for (int i = 1; i < cache->lineNum; ++i) {
        if (lines[i].age < minAge){
            minAge = lines[i].age;
            minIndex = i;
        }
    }
    return lines+minIndex;
}

主函数

int main(int argc, char** argv)
{
	// parse args
	ArgsInfo argsInfo;
	parseArgs(&argsInfo, argc, argv);
	Cache cache;
	initCache(&cache, &argsInfo);

	FILE* fp = fopen(argsInfo.file, "r");
    if(!fp) {
        perror("File opening failed");
        return EXIT_FAILURE;
    }

    char *buf = NULL;
    size_t len;
    AccessInfo accessInfo;
//    Dprintf("sizeof unsigned = %lu\n", sizeof(unsigned )); // 4
//    unsigned a = 0x80000000;
//    Dprintf("a >> 31 = 0x%x\n", a >> 31); // 0x1
//    int a = 0x80000000;
//    Dprintf("a >> 31 = 0x%x\n", a >> 31); // 0xffffffff

    __int64_t min64 = 1; // 有符号数
    __uint64_t temp = (min64 << 63) >> (argsInfo.setBitNum - 1); // 算数右移
    Dprintf("temp = 0x%llx\n", temp);
    __uint64_t mask = ((__uint64_t)temp) >> (64 - argsInfo.setBitNum); // setBitNum 位数的掩码,逻辑右移
    Dprintf("mask = 0x%llx\n", mask);

    int hits = 0, misses = 0, evictions = 0;
    while (getline(&buf, &len, fp) != -1) {
        if (parseAccessInfo(buf, &accessInfo)) {
            __uint64_t tagNSet = accessInfo.address >> argsInfo.blockBitNum;
            __uint64_t tag = (tagNSet & ~(mask)) >> argsInfo.setBitNum;
            __uint64_t setIndex = tagNSet & mask;
            Dprintf("tag = 0x%llx, set = 0x%llx\n", tag, setIndex);

            if (argsInfo.verbose) {
                printf("%c %llx,%d ", accessInfo.op, accessInfo.address, accessInfo.size);
            }

            switch (accessInfo.op) {
                case 'S':
                case 'L': { // remember to use brackets to include the whole case
                    struct cacheLine *line = NULL;
                    if ((line = findData(&cache, tag, setIndex))) { // remember to use little brackets to include the assignment
                        hits++;
                        line->age = cache.sets[setIndex].ageCount;
                        if (argsInfo.verbose) printf("hit\n");
                    } else {
                        misses++;
                        if (argsInfo.verbose) printf("miss");
                        struct cacheLine *modifyLine = NULL;
                        if ((modifyLine = findEmptyLine(&cache, setIndex))) {
                            modifyLine->valid = true;
                            modifyLine->tag = tag;
                            modifyLine->age = cache.sets[setIndex].ageCount;
                            if (argsInfo.verbose) printf("\n");
                        } else {
                            evictions++;
                            if (argsInfo.verbose) printf(" eviction\n");
                            struct cacheLine *evictedLine = findEvictionLine(&cache, setIndex);
                            Dprintf("Evict: set=%llu, tag=%llx\n", setIndex, evictedLine->tag);
                            evictedLine->valid = true;
                            evictedLine->tag = tag;
                            evictedLine->age = cache.sets[setIndex].ageCount;
                        }
                    }
                    cache.sets[setIndex].ageCount++;
                    break;
                }
                case 'M': {
                    struct cacheLine *line = NULL;
                    if ((line = findData(&cache, tag, setIndex))) {
                        hits++;
                        if (argsInfo.verbose) printf("hit ");
                        line->age = cache.sets[setIndex].ageCount;
                        line->valid = true;
                        hits++;
                        if (argsInfo.verbose) printf("hit\n"); // hit by store
                    } else {
                        misses++;
                        if (argsInfo.verbose) printf("miss ");
                        struct cacheLine* modifyLine = NULL;
                        if ((modifyLine = findEmptyLine(&cache, setIndex))) {
                            modifyLine->valid = true;
                            modifyLine->age = cache.sets[setIndex].ageCount;
                            modifyLine->tag = tag;
                            hits++;
                            if (argsInfo.verbose) printf("hit\n");
                        } else {
                            evictions++;
                            if (argsInfo.verbose) printf("eviction ");
                            struct cacheLine* evictedLine = findEvictionLine(&cache, setIndex);
                            Dprintf("Evict: set=%llu, tag=%llx\n", setIndex, evictedLine->tag);
                            evictedLine->valid = true;
                            evictedLine->tag = tag;
                            evictedLine->age = cache.sets[setIndex].ageCount;
                            hits++;
                            if (argsInfo.verbose) printf("hit\n");
                        }
                    }
                    cache.sets[setIndex].ageCount++;
                    break;
                }
                default:
                    exit(-1);
            }
        }
    }
	freeCache(&cache);
    printSummary(hits, misses, evictions);
    return 0;
}

bool parseAccessInfo(char* buf, AccessInfo* accessInfo){
    if (buf[0] == 'I') { // ignore the instruction access
        return false;
    }
    accessInfo->op = buf[1];
    accessInfo->address = strtol(&buf[3], NULL, 16);
    char *ptr = strtok(buf, ",");
//    Dprintf("%s", ptr);
    ptr = strtok(NULL, " ");
    accessInfo->size = (int)strtol(ptr, NULL, 10);
    Dprintf("\nAccessInfo: op = %c, address = 0x%llx, size = %d\n",
            accessInfo->op, accessInfo->address, accessInfo->size);
    return true;
}

为了使用test-csim,发现linux不支持getline函数,所以使用fgets替代,目的都是读取一行输入;同时linux也不支持__VA_ARGS__的宏。做了这些改动之后在linux服务器上运行得到正确结果:

sugar@ubuntuServer:~/csappLab/cachelab-handout$ make
gcc -g -Wall -Werror -std=c99 -m64 -o csim csim.c cachelab.c -lm 
gcc -g -Wall -Werror -std=c99 -m64 -O0 -c trans.c
gcc -g -Wall -Werror -std=c99 -m64 -o test-trans test-trans.c cachelab.c trans.o 
gcc -g -Wall -Werror -std=c99 -m64 -O0 -o tracegen tracegen.c trans.o cachelab.c
# Generate a handin tar file each time you compile
tar -cvf sugar-handin.tar  csim.c trans.c 
csim.c
trans.c
sugar@ubuntuServer:~/csappLab/cachelab-handout$ ./test-csim 
                        Your simulator     Reference simulator
Points (s,E,b)    Hits  Misses  Evicts    Hits  Misses  Evicts
     3 (1,1,1)       9       8       6       9       8       6  traces/yi2.trace
     3 (4,2,4)       4       5       2       4       5       2  traces/yi.trace
     3 (2,1,4)       2       3       1       2       3       1  traces/dave.trace
     3 (2,1,3)     167      71      67     167      71      67  traces/trans.trace
     3 (2,2,3)     201      37      29     201      37      29  traces/trans.trace
     3 (2,4,3)     212      26      10     212      26      10  traces/trans.trace
     3 (5,1,5)     231       7       0     231       7       0  traces/trans.trace
     6 (5,1,5)  265189   21775   21743  265189   21775   21743  traces/long.trace
    27

TEST_CSIM_RESULTS=27

Part B

实验准备:需要程序运行时内存访问序列的获取,linux可能需要安装valgrind

编写矩阵转置算法,使得该算法在一个直接映射的cache上有较高的命中率。cache的参数为:32组、每组1行、块大小为32bytes。

一开始只是使用了分块的思想,每次处理一个8 * 8大小的int类型矩阵,选择8是因为cache每行只能放下8个int元素。

char transpose_submit_desc[] = "Transpose submission";
void transpose_submit(int M, int N, int A[N][M], int B[M][N]){
    // cache 共32组,每组1行,每行32byte(8个int)
    int tmp;
    int bsize = 8; // 小块矩阵为 8 * 8
    for (int i = 0; i < N; i += bsize) {
        for (int j = 0; j < M; j += bsize) {
            for (int bi = i; bi < (i + bsize < N ? i + bsize : N); ++bi) {
                for (int bj = j; bj < (j + bsize < M ? j + bsize : M); ++bj) {
                    B[bj][bi] = A[bi][bj];
                }
            }
        }
    }
}

但是运行32*32和61*67都没有到达满分:

对于32 * 32的矩阵
sugar@ubuntuServer:~/csappLab/cachelab-handout$ ./test-trans -M 32 -N 32

Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:1710, misses:343, evictions:311

sugar@ubuntuServer:~/csappLab/cachelab-handout$ ./test-trans -M 61 -N 67

Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:6061, misses:2118, evictions:2086

后来仔细观察矩阵转置的内存地址访问序列发现,访问地址的后12位指定了cache的访问组号和标记等信息,而后12位的高9位指定了访问的行和列。

S 0034a65c,4
 L 0030ac30,4
 S 0034a6dc,4
 L 0030ac34,4
 S 0034a75c,4
 L 0030ac38,4
 S 0034a7dc,4
 L 0030ac3c,4
 S 0034a85c,4
 L 0030a8c0,4
 S 0034a8c0,4
 L 0030a8c4,4
 S 0034a940,4
 L 0030a8c8,4
 S 0034a9c0,4
 L 0030a8cc,4
 S 0034aa40,4

后12位指定了访问的行列信息:
    地址位数
    ----->12 11<------------>2 1<->0
    数组基址   row * 32 + col     in

所以访问两个数组同行同列的元素一定会导致cache的miss和eviction,所以对对角线元素特殊处理

char transpose_submit_desc[] = "Transpose submission";
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    // cache 共32组,每组1行,每行32byte(8个int)
    int tmp;
    int bsize = 8; // 小块矩阵为 8 * 8
    for (int i = 0; i < N; i += bsize) {
        for (int j = 0; j < M; j += bsize) {
            for (int bi = i; bi < (i + bsize < N ? i + bsize : N); ++bi) {
                int index = 0;
                for (int bj = j; bj < (j + bsize < M ? j + bsize : M); ++bj) {
                    if (bi != bj) {
                        B[bj][bi] = A[bi][bj];
                    } else {
                        tmp = A[bi][bj];
                        index = bi;
                    }
                }
                // 完成A的一行元素转置后,再把对角线上的元素赋给B,否则先访问位于同行列的B后,把A的行给驱逐
                if (i == j) {
                    B[index][index] = tmp;
                }
            }
        }
    }
}

可以对32*32的运算miss降低到300以下:

sugar@ubuntuServer:~/csappLab/cachelab-handout$ ./test-trans -M 32 -N 32

Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:1766, misses:287, evictions:255

ddfd对于M=61、N=67的矩阵,要求比较宽松,使用16 * 16的小矩阵即可:

if (M == 61 && N == 67) {
    int i, j, k, l;
    for (i = 0; i < N; i+=16)
    {
        for (j = 0; j < M; j+=16)
        {
            for (k = i; k < i + 16 && k < N; k++)
            {
                for (l = j; l < j + 16 && l < M; l++)
                {
                    B[l][k] = A[k][l];
                }
            }
        }
    }
}

对于64 * 64的矩阵,由于矩阵的一行有64个int,如果还是使用8*8的小矩阵,那么小矩阵的第0行和第4行会存在同一个cache组内(根据地址判断),导致冲突。使用4 * 4的小矩阵作为替代(参考

if (M == 64 && N == 64) {
    int i, j, k;
    int v0, v1, v2, v3;
    for (i = 0; i < N; i+=4)
    {
        for (j = 0; j < M; j+=4)
        {
            for (k = i; k < i + 4; k++)
            {
                v0 = A[k][j];
                v1 = A[k][j+1];
                v2 = A[k][j+2];
                v3 = A[k][j+3];
                B[j][k] = v0;
                B[j+1][k] = v1;
                B[j+2][k] = v2;
                B[j+3][k] = v3;
            }
        }
    }
}