voidinitlog(intdev){if(sizeof(struct logheader)>= BSIZE)panic("\tinitlog: logheader is too big.\n");struct superblock sb;initlock(&log.lock,"log");readsb(dev,&sb);
log.start= sb.logstart;
log.size= sb.nlog;
log.dev= dev;recover_from_log();cprintf("initlog: success.\n");}
其中,super block 保存了磁盘的布局信息,详见注释:
inc/fs.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/** Disk layout:
* [boot block | super block | log | inode blocks | free bit map | data blocks]
** mkfs computes the super block and builds an initial file system.
* The super block describes the disk layout:
*/structsuperblock{uint32_t size;// Size of file system image (blocks)
uint32_t nblocks;// Number of data blocks
uint32_t ninodes;// Number of inodes
uint32_t nlog;// Number of log blocks
uint32_t logstart;// Block number of first log block
uint32_t inodestart;// Block number of first inode block
uint32_t bmapstart;// Block number of first free map block
};
我们先调用函数 readsb 读取 super block。
kern/fs.c
1
2
3
4
5
6
7
8
9
10
/** Read the super block.
*/voidreadsb(intdev,struct superblock*sb){struct buf* b =bread(dev,1);memmove(sb, b->data,sizeof(*sb));brelse(b);}
然后我们利用 super block 中的信息初始化 log,其中 log 的结构如下所示:
kern/log.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/** Contents of the header block, used for both the on-disk header block
* and to keep track in memory of logged block # before commit.
*/structlogheader{int n;int block[LOGSIZE];};structlog{struct spinlock lock;int start;int size;int outstanding;// How many FS sys calls are executing.
int committing;// In commit(), please wait.
int dev;struct logheader lh;} log;
/** Read the log header from disk into the in-memory log header.
*/staticvoidread_head(){struct buf* buf =bread(log.dev, log.start);struct logheader* lh =(struct logheader*)(buf->data);
log.lh.n= lh->n;for(int i =0; i < log.lh.n;++i) log.lh.block[i]= lh->block[i];brelse(buf);}
/** Copy committed blocks from log to their home location.
*/staticvoidinstall_trans(){for(int i =0; i < log.lh.n;++i){struct buf* log_buf =bread(log.dev, log.start+ i +1);struct buf* dst_buf =bread(log.dev, log.lh.block[i]);memmove(dst_buf->data, log_buf->data, BSIZE);brelse(log_buf);bwrite(dst_buf);brelse(dst_buf);}}
/** Write in-memory log header to disk.
* This is the true point at which the
* current transaction commits.
*/staticvoidwrite_head(){struct buf* buf =bread(log.dev, log.start);struct logheader* lh =(struct logheader*)(buf->data);
lh->n = log.lh.n;for(int i =0; i < log.lh.n;++i) lh->block[i]= log.lh.block[i];bwrite(buf);brelse(buf);}
/** Caller has modified b->data and is done with the buffer.
* Record the block number and pin in the cache with B_DIRTY.
* commit() / write_log() will do the disk write.
** log_write() replaces bwrite(); a typical use is:
* bp = bread(...)
* modify bp->data[]
* log_write(bp)
* brelse(bp)
*/voidlog_write(struct buf*b){if(log.lh.n>= LOGSIZE || log.lh.n>= log.size-1)panic("\tlog_write: transaction is too big.\n");if(log.outstanding<1)panic("\tlog_write: outside of transaction.\n");acquire(&log.lock);int i =0;for(; i < log.lh.n;++i){if(log.lh.block[i]== b->blockno)break;// log absorption
}if(i == log.lh.n){
log.lh.block[i]= b->blockno;++log.lh.n;}
b->flags |= B_DIRTY;// prevent eviction
release(&log.lock);}
/** Called at the start of each FS system call.
*/voidbegin_op(){acquire(&log.lock);while(1){if(log.committing){sleep(&log,&log.lock);}elseif(log.lh.n+(log.outstanding+1)* MAXOPBLOCKS > LOGSIZE){// This op might exhaust log space; wait for commit.
sleep(&log,&log.lock);}else{++log.outstanding;release(&log.lock);break;}}}
/** Called at the end of each FS system call.
* Commits if this was the last outstanding operation.
*/voidend_op(){int do_commit =0;acquire(&log.lock);--log.outstanding;if(log.committing)panic("\tend_op: log is committing.\n");if(!log.outstanding){
do_commit =1;
log.committing=1;}else{// begin_op() may be waiting for log space, and decrementing
// log.outstanding has decreased the amount of reserved space.
wakeup(&log);}release(&log.lock);if(do_commit){commit();acquire(&log.lock);
log.committing=0;wakeup(&log);release(&log.lock);}}
/** Write in-memory log header to disk.
* This is the true point at which the
* current transaction commits.
*/staticvoidwrite_head(){struct buf* buf =bread(log.dev, log.start);struct logheader* lh =(struct logheader*)(buf->data);
lh->n = log.lh.n;for(int i =0; i < log.lh.n;++i) lh->block[i]= log.lh.block[i];bwrite(buf);brelse(buf);}
/** In-memory copy of an inode.
*/structinode{uint32_t dev;// Device number
uint32_t inum;// Inode number
int ref;// Reference count
struct sleeplock lock;// Protects everything below here
int valid;// Inode has been read from disk?
uint16_t type;// Copy of disk inode
uint16_t major;uint16_t minor;uint16_t nlink;uint32_t size;uint32_t addrs[NDIRECT +1];};
/** Allocate an inode on device dev.
** Mark it as allocated by giving it type type.
* Returns an unlocked but allocated and referenced inode.
*/struct inode*ialloc(uint32_tdev,uint16_ttype){for(int inum =1; inum < sb.ninodes;++inum){struct buf* bp =bread(dev,IBLOCK(inum, sb));struct dinode* dip =(struct dinode*)bp->data + inum % IPB;if(!dip->type){// a free inode
memset(dip,0,sizeof(*dip));
dip->type = type;log_write(bp);// mark it allocated on the disk
brelse(bp);returniget(dev, inum);}brelse(bp);}panic("\tialloc: no inodes.\n");return0;}
/** Find the inode with number inum on device dev
* and return the in-memory copy. Does not lock
* the inode and does not read it from disk.
*/staticstruct inode*iget(uint32_tdev,uint32_tinum){acquire(&icache.lock);// Is the inode already cached?
struct inode* empty =NULL;for(int i =0; i < NINODE;++i){struct inode* ip =&icache.inode[i];if(ip->ref >0&& ip->dev == dev && ip->inum == inum){
ip->ref++;release(&icache.lock);return ip;}if(!empty &&!ip->ref) empty = ip;// remember empty slot
}// Recycle an inode cache entry.
if(!empty)panic("\tiget: no inodes.\n");struct inode* ip = empty;
ip->dev = dev;
ip->inum = inum;
ip->ref =1;
ip->valid =0;release(&icache.lock);return ip;}
/** Copy a modified in-memory inode to disk.
** Must be called after every change to an ip->xxx field
* that lives on disk, since i-node cache is write-through.
* Caller must hold ip->lock.
*/voidiupdate(struct inode*ip){struct buf* bp =bread(ip->dev,IBLOCK(ip->inum, sb));struct dinode* dip =(struct dinode*)bp->data + ip->inum % IPB;
dip->type = ip->type;
dip->major = ip->major;
dip->minor = ip->minor;
dip->nlink = ip->nlink;
dip->size = ip->size;memmove(dip->addrs, ip->addrs,sizeof(ip->addrs));log_write(bp);brelse(bp);}
/** Increment reference count for ip.
* Returns ip to enable ip = idup(ip1) idiom.
*/struct inode*idup(struct inode*ip){acquire(&icache.lock);
ip->ref++;release(&icache.lock);return ip;}
/** Drop a reference to an in-memory inode.
** If that was the last reference, the inode cache entry can
* be recycled.
* If that was the last reference and the inode has no links
* to it, free the inode (and its content) on disk.
* All calls to iput() must be inside a transaction in
* case it has to free the inode.
*/voidiput(struct inode*ip){acquire(&icache.lock);if(ip->ref ==1&& ip->valid &&!ip->nlink){// ip->ref == 1 means no other process can have ip locked,
// so this acquiresleep() won't block (or deadlock).
acquiresleep(&ip->lock);release(&icache.lock);// inode has no links and no other references: truncate and free.
itrunc(ip);
ip->type =0;iupdate(ip);
ip->valid =0;releasesleep(&ip->lock);acquire(&icache.lock);}
ip->ref--;release(&icache.lock);}
/** Truncate inode (discard contents).
** Only called when the inode has no links
* to it (no directory entries referring to it)
* and has no in-memory reference to it (is
* not an open file or current directory).
*/staticvoiditrunc(struct inode*ip){for(int i =0; i < NDIRECT;++i){if(ip->addrs[i]){bfree(ip->dev, ip->addrs[i]);
ip->addrs[i]=0;}}if(ip->addrs[NDIRECT]){struct buf* bp =bread(ip->dev, ip->addrs[NDIRECT]);uint32_t* a =(uint32_t*)bp->data;for(int j =0; j < NINDIRECT;++j){if(a[j])bfree(ip->dev, a[j]);}brelse(bp);bfree(ip->dev, ip->addrs[NDIRECT]);
ip->addrs[NDIRECT]=0;}
ip->size =0;iupdate(ip);}
其中,函数 bfree 用于释放一个 block,将其在 bitmap 中标记为未使用。
kern/fs.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/** Free a disk block.
*/staticvoidbfree(intdev,uint32_tb){struct buf* bp =bread(dev,BBLOCK(b, sb));int bi = b % BPB;int m =1<<(bi %8);if(!(bp->data[bi /8]& m))panic("\tbfree: freeing a free block.\n");
bp->data[bi /8]&=~m;log_write(bp);brelse(bp);}
1.4.8 iunlockput
函数 iunlockput 是 iunlock + iput 的别名。
kern/fs.c
1
2
3
4
5
6
7
8
9
/** Common idiom: unlock, then put.
*/voidiunlockput(struct inode*ip){iunlock(ip);iput(ip);}
1.4.9 stati
函数 stati 的主要工作是复制 inode 的元数据(metadata)到 stat 结构,届时用户程序可以通过 stat 系统调用读取。
/** Copy stat information from inode.
* Caller must hold ip->lock.
*/voidstati(struct inode*ip,struct stat*st){// FIXME: Support other fields in stat.
st->st_dev = ip->dev;
st->st_ino = ip->inum;
st->st_nlink = ip->nlink;
st->st_size = ip->size;switch(ip->type){case T_FILE: st->st_mode = S_IFREG;break;case T_DIR: st->st_mode = S_IFDIR;break;case T_DEV: st->st_mode =0;break;default:panic("\tstati: unexpected stat type %d.\n", ip->type);}}
/** Read data from inode.
* Caller must hold ip->lock.
*/ssize_treadi(struct inode*ip,char*dst,size_toff,size_tn){if(ip->type == T_DEV){if(ip->major <0|| ip->major >= NDEV ||!devsw[ip->major].read)return-1;return devsw[ip->major].read(ip, dst, n);}if(off > ip->size || off + n < off)return-1;if(off + n > ip->size) n = ip->size - off;for(size_t tot =0, m =0; tot < n; tot += m, off += m, dst += m){struct buf* bp =bread(ip->dev,bmap(ip, off / BSIZE));
m =min(n - tot, BSIZE - off % BSIZE);memmove(dst, bp->data + off % BSIZE, m);brelse(bp);}return n;}
/** Inode content
** The content (data) associated with each inode is stored
* in blocks on the disk. The first NDIRECT block numbers
* are listed in ip->addrs[]. The next NINDIRECT blocks are
* listed in block ip->addrs[NDIRECT].
** Return the disk block address of the nth block in inode ip.
* If there is no such block, bmap allocates one.
*/staticuint32_tbmap(struct inode*ip,uint32_tbn){if(bn < NDIRECT){// Load direct block, allocating if necessary.
uint32_t addr = ip->addrs[bn];if(!addr) ip->addrs[bn]= addr =balloc(ip->dev);return addr;}
bn -= NDIRECT;if(bn < NINDIRECT){// Load indirect block, allocating if necessary.
uint32_t addr = ip->addrs[NDIRECT];if(!addr) ip->addrs[NDIRECT]= addr =balloc(ip->dev);struct buf* bp =bread(ip->dev, addr);uint32_t* a =(uint32_t*)bp->data;
addr = a[bn];if(!addr){
a[bn]= addr =balloc(ip->dev);log_write(bp);}brelse(bp);return addr;}panic("\tbmap: out of range.\n");return0;}
/** Allocate a zeroed disk block.
*/staticuint32_tballoc(uint32_tdev){for(int b =0; b < sb.size; b += BPB){struct buf* bp =bread(dev,BBLOCK(b, sb));for(int bi =0; bi < BPB && b + bi < sb.size;++bi){int m =1<<(bi %8);if(!(bp->data[bi /8]& m)){// Is block free?
bp->data[bi /8]|= m;// Mark block in use.
log_write(bp);brelse(bp);bzero(dev, b + bi);return b + bi;}}brelse(bp);}panic("\tballoc: out of blocks.\n");return0;}
函数 bzero 用于清空一个 block。
kern/fs.c
1
2
3
4
5
6
7
8
9
10
11
/** Zero a block.
*/staticvoidbzero(intdev,intbno){struct buf* b =bread(dev, bno);memset(b->data,0, BSIZE);log_write(b);brelse(b);}
/** Write data to inode.
* Caller must hold ip->lock.
*/ssize_twritei(struct inode*ip,char*src,size_toff,size_tn){if(ip->type == T_DEV){if(ip->major <0|| ip->major >= NDEV ||!devsw[ip->major].write)return-1;return devsw[ip->major].write(ip, src, n);}if(off > ip->size || off + n < off)return-1;if(off + n > MAXFILE * BSIZE)return-1;for(size_t tot =0, m =0; tot < n; tot += m, off += m, src += m){struct buf* bp =bread(ip->dev,bmap(ip, off / BSIZE));
m =min(n - tot, BSIZE - off % BSIZE);memmove(bp->data + off % BSIZE, src, m);log_write(bp);brelse(bp);}if(n >0&& off > ip->size){
ip->size = off;iupdate(ip);}return n;}
/** Close file f. (Decrement ref count, close when reaches 0.)
*/voidfile_close(struct file*f){acquire(&ftable.lock);if(f->ref <1)panic("\tfile_close: invalid file.\n");if(--f->ref >0){release(&ftable.lock);return;}struct file ff =*f;
f->ref =0;
f->type = FD_NONE;release(&ftable.lock);if(ff.type== FD_INODE){begin_op();iput(ff.ip);end_op();}else{panic("\tfile_close: unsupported type.\n");}}
1.7.5 file_stat
函数 file_stat 的主要工作是调用函数 stati 读取文件的元数据。
kern/file.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/** Get metadata about file f.
*/intfile_stat(struct file*f,struct stat*st){if(f->type == FD_INODE){ilock(f->ip);stati(f->ip, st);iunlock(f->ip);return0;}return-1;}
1.7.6 file_read
函数 file_read 的主要工作是对于普通文件,调用函数 readi 从文件中读取数据。
kern/file.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/** Read from file f.
*/ssize_tfile_read(struct file*f,char*addr,ssize_tn){if(!f->readable)return-1;if(f->type == FD_INODE){ilock(f->ip);int r =readi(f->ip, addr, f->off, n);if(r >0) f->off += r;iunlock(f->ip);return r;}panic("\tfile_read: unsupported type.\n");return0;}
/** Write to file f.
*/ssize_tfile_write(struct file*f,char*addr,ssize_tn){if(!f->writable)return-1;if(f->type == FD_INODE){// Write a few blocks at a time to avoid exceeding the maximum log
// transaction size, including i-node, indirect block, allocation
// blocks, and 2 blocks of slop for non-aligned writes. This really
// belongs lower down, since writei() might be writing a device like the
// console.
int max =((MAXOPBLOCKS -4)/2)*512;int i =0;while(i < n){int n1 = n - i;if(n1 > max) n1 = max;begin_op();ilock(f->ip);int r =writei(f->ip, addr + i, f->off, n1);if(r >0) f->off += r;iunlock(f->ip);end_op();if(r <0)break;if(r != n1)panic("\tfile_write: partial data written.\n");
i += r;}return i == n ? n :-1;}panic("\tfile_write: unsupported type.\n");return0;}
voidtrap(struct trapframe*tf){int ec =resr()>> EC_SHIFT, iss =resr()& ISS_MASK;lesr(0);// Clear esr.
switch(ec){case EC_UNKNOWN:interrupt(tf);break;case EC_SVC64:if(!iss){/* Jump to syscall to handle the system call from user process */
tf->x0 =syscall1(tf);}else{cprintf("trap: unexpected svc iss 0x%x\n", iss);}break;default:panic("\ttrap: unexpected irq.\n");}}
2.4 syscall.c
我们根据之前保存在寄存器 X8 的值,可以得到当前的 system call number。随后利用函数指针表 syscalls,即可进行相应的系统调用。