erofs-utils mkfs 概览 | TJ的技术博客

参考erofs-utils 1.2.1, 入口是erofs_mkfs_build_tree(), 不关注xattr/extent, 看之前我们再了解下inode:

Inodes store information about files and directories (folders), such as file ownership, access mode (read, write, execute permissions), and file type.
Each file is associated with an inode, which is identified by an integer, often referred to as an i-number or inode number.

ok, let’s start:

struct erofs_inode *erofs_mkfs_build_tree(struct erofs_inode *dir)
{
	...
	if (!S_ISDIR(dir->i_mode)) {
		if (S_ISLNK(dir->i_mode)) {
			char *const symlink = malloc(dir->i_size);

			if (!symlink)
				return ERR_PTR(-ENOMEM);
			ret = readlink(dir->i_srcpath, symlink, dir->i_size);
			if (ret < 0) {
				free(symlink);
				return ERR_PTR(-errno);
			}

			ret = erofs_write_file_from_buffer(dir, symlink);
			free(symlink);
			if (ret)
				return ERR_PTR(ret);
		} else {
			ret = erofs_write_file(dir);
			if (ret)
				return ERR_PTR(ret);
		}

		erofs_prepare_inode_buffer(dir);
		erofs_write_tail_end(dir);
		return dir;
	}

先看link file，走erofs_write_file_from_buffer()：

/*
 * Store an in-memory buffer as the data of an uncompressed inode.
 *
 * The first erofs_blknr(inode->i_size) blocks of @buf are written to the
 * data blocks obtained through __allocate_inode_bh_data(); the remaining
 * (i_size % EROFS_BLKSIZ) bytes are copied into a freshly malloc()ed
 * inode->idata and recorded in inode->idata_size as tail-end data.
 *
 * Returns 0 on success, -ENOMEM if the tail-end copy cannot be allocated,
 * or the error reported by __allocate_inode_bh_data() / blk_write().
 */
int erofs_write_file_from_buffer(struct erofs_inode *inode, char *buf)
{
	const unsigned int nblocks = erofs_blknr(inode->i_size);
	int ret;

	inode->datalayout = EROFS_INODE_FLAT_INLINE; //tj: inline data

	ret = __allocate_inode_bh_data(inode, nblocks);
	if (ret)
		return ret;

	if (nblocks) {
		/* propagate write errors instead of silently dropping them */
		ret = blk_write(buf, inode->u.i_blkaddr, nblocks);
		if (ret)
			return ret;
	}

	inode->idata_size = inode->i_size % EROFS_BLKSIZ; //tj: get tail-end size
	if (inode->idata_size) {
		inode->idata = malloc(inode->idata_size);
		if (!inode->idata)
			return -ENOMEM;
		/* the tail starts right after the full blocks written above */
		memcpy(inode->idata, buf + blknr_to_addr(nblocks),
		       inode->idata_size);
	}
	return 0;
}

相关fields:

struct erofs_inode {
	...
	umode_t i_mode;
	erofs_off_t i_size; //tj: file size
	...

	union {
		u32 i_blkaddr; //tj: for uncompressed
		u32 i_blocks; //tj: for compressed
		u32 i_rdev;
	} u;

	unsigned char inode_isize;
	/* inline tail-end packing size */
	unsigned short idata_size;

	unsigned int xattr_isize;
	unsigned int extent_isize;

	void *idata; //tj: hold tail-end data

__allocate_inode_bh_data()用来分配data buffer，得到块首地址(inode->u.i_blkaddr)，涉及buffer cache，这里暂略过。

非symlink文件(也应该就是regular file)走的是erofs_write_file():

/*
 * Write the data of a non-symlink file.
 *
 * Empty files only get their layout set to EROFS_INODE_FLAT_PLAIN.
 * Otherwise compression is tried first when it is configured and the file
 * is compressible; any result other than -ENOSPC is returned as is.
 * On -ENOSPC, or when compression is not attempted, the source file is
 * opened and stored uncompressed.
 *
 * Returns 0 on success or a negative error code.
 */
int erofs_write_file(struct erofs_inode *inode)
{
	int srcfd;
	int err;

	if (inode->i_size == 0) {
		inode->datalayout = EROFS_INODE_FLAT_PLAIN;
		return 0;
	}

	if (cfg.c_compr_alg_master && erofs_file_is_compressible(inode)) {
		err = erofs_write_compressed_file(inode);

		/* only -ENOSPC takes the uncompressed path below */
		if (err != -ENOSPC)
			return err;
	}

	/* fallback to all data uncompressed */
	srcfd = open(inode->i_srcpath, O_RDONLY | O_BINARY);
	if (srcfd < 0)
		return -errno;

	err = write_uncompressed_file_from_fd(inode, srcfd);
	close(srcfd);
	return err;
}

先check：如果配置了压缩且文件可压缩，就走压缩流程erofs_write_compressed_file()；只有当它返回-ENOSPC（压缩后的block数并不比原文件少）时才fallback到uncompressed: write_uncompressed_file_from_fd()，其他返回值（成功或别的错误）都直接返回。

先看写压缩：

/*
 * Excerpt (parts elided with "..."): reads the source file into ctx.queue
 * chunk by chunk, calls vle_compress_one() after every read and once more
 * with the last argument set to true, then bails out with -ENOSPC when the
 * number of compressed blocks is not smaller than
 * BLK_ROUND_UP(inode->i_size).
 */
int erofs_write_compressed_file(struct erofs_inode *inode)
{
	...
	blkaddr = erofs_mapbh(bh->block, true);	/* start_blkaddr */
	ctx.blkaddr = blkaddr;
	ctx.metacur = compressmeta + Z_EROFS_LEGACY_MAP_HEADER_SIZE;
	ctx.head = ctx.tail = 0;
	ctx.clusterofs = 0;
	remaining = inode->i_size;

	while (remaining) {
		/* read no more than what is left and what still fits in the queue */
		const u64 readcount = min_t(u64, remaining,
					    sizeof(ctx.queue) - ctx.tail);

		ret = read(fd, ctx.queue + ctx.tail, readcount);
		/*
		 * NOTE(review): a short read (0 <= ret < readcount) does not
		 * set errno, so ret may become 0 here -- confirm how err_bdrop
		 * treats that case.
		 */
		if (ret != readcount) {
			ret = -errno;
			goto err_bdrop;
		}
		remaining -= readcount;
		ctx.tail += readcount;

		/* do one compress round */
		ret = vle_compress_one(inode, &ctx, false);
		if (ret)
			goto err_bdrop;
	}

	/* do the final round */
	ret = vle_compress_one(inode, &ctx, true);
	if (ret)
		goto err_bdrop;

	/* fall back to no compression mode */
	compressed_blocks = ctx.blkaddr - blkaddr;
	if (compressed_blocks >= BLK_ROUND_UP(inode->i_size)) {
		ret = -ENOSPC; //tj: no blocks saved; erofs_write_file() falls back to uncompressed on -ENOSPC
		goto err_bdrop;
	}
	...
	erofs_info("compressed %s (%llu bytes) into %u blocks",
		   inode->i_srcpath, (unsigned long long)inode->i_size,
		   compressed_blocks);

整体逻辑就是循环读入数据，每读一批就调用vle_compress_one()压缩一轮，全部读完后再以最后一个参数为true调用一次做收尾。之后区分压缩类型：

/* pick the on-disk compression layout depending on cfg.c_legacy_compress */
if (cfg.c_legacy_compress) {
	inode->extent_isize = legacymetasize;
	inode->datalayout = EROFS_INODE_FLAT_COMPRESSION_LEGACY; //tj: legacy compression layout
} else {
	/* otherwise convert the legacy metadata to the compacted format */
	ret = z_erofs_convert_to_compacted_format(inode, blkaddr - 1,
						  legacymetasize, 12);
	DBG_BUGON(ret);
}

再看非压缩：

static int write_uncompressed_file_from_fd(struct erofs_inode *inode, int fd)
{
	int ret;
	unsigned int nblocks, i;

	inode->datalayout = EROFS_INODE_FLAT_INLINE;
	nblocks = inode->i_size / EROFS_BLKSIZ;

	ret = __allocate_inode_bh_data(inode, nblocks);
	if (ret)
		return ret;

	for (i = 0; i < nblocks; ++i) {
		char buf[EROFS_BLKSIZ];

		ret = read(fd, buf, EROFS_BLKSIZ);
		if (ret != EROFS_BLKSIZ) {
			if (ret < 0)
				return -errno;
			return -EAGAIN;
		}

		ret = blk_write(buf, inode->u.i_blkaddr + i, 1);
		if (ret)
			return ret;
	}

同样，先通过__allocate_inode_bh_data()分配data buffer得到i_blkaddr，然后写入img。Then, check尾端数据:

/* read the tail-end data */
inode->idata_size = inode->i_size % EROFS_BLKSIZ;
if (inode->idata_size) {
	inode->idata = malloc(inode->idata_size);
	if (!inode->idata)
		return -ENOMEM;

	ret = read(fd, inode->idata, inode->idata_size);
	if (ret < inode->idata_size) {
		free(inode->idata);
		inode->idata = NULL;
		return -EIO;
	}
}

如果有tail-end data，那么读出来到inode->idata供后面使用。

ok，接下来会call erofs_prepare_inode_buffer():

/*
 * Allocate the buffer head that will hold the on-disk inode (excerpt, the
 * part that computes inodesize is elided with "...").
 *
 * For uncompressed inodes erofs_balloc() is first asked for room for
 * inodesize plus idata_size bytes of inline tail-end data.  If that fails
 * with -ENOSPC, or if the inode uses a compressed layout, the noinline path
 * calls erofs_prepare_tail_block() and allocates the inode without inline
 * room.  When the first allocation succeeds and there is tail-end data, an
 * inline buffer is attached and remembered in inode->bh_inline.
 *
 * Returns 0 on success or a negative error code.
 */
int erofs_prepare_inode_buffer(struct erofs_inode *inode)
{
	unsigned int inodesize;
	struct erofs_buffer_head *bh, *ibh;

	DBG_BUGON(inode->bh || inode->bh_inline);

	...

	if (is_inode_layout_compression(inode))
		goto noinline; //tj: compressed layouts never take the inline path

	/*
	 * if the file size is block-aligned for uncompressed files,
	 * should use EROFS_INODE_FLAT_PLAIN data mapping mode.
	 */
	if (!inode->idata_size)
		inode->datalayout = EROFS_INODE_FLAT_PLAIN;

	bh = erofs_balloc(INODE, inodesize, 0, inode->idata_size);
	if (bh == ERR_PTR(-ENOSPC)) {
		int ret;

		/* no room for inline data: switch to the plain layout */
		inode->datalayout = EROFS_INODE_FLAT_PLAIN;
noinline:
		/* expend an extra block for tail-end data */
		ret = erofs_prepare_tail_block(inode);
		if (ret)
			return ret;
		bh = erofs_balloc(INODE, inodesize, 0, 0);
		if (IS_ERR(bh))
			return PTR_ERR(bh);
		DBG_BUGON(inode->bh_inline);
	} else if (IS_ERR(bh)) {
		return PTR_ERR(bh);
	} else if (inode->idata_size) {
		inode->datalayout = EROFS_INODE_FLAT_INLINE;

		/* allocate inline buffer */
		ibh = erofs_battach(bh, META, inode->idata_size);
		if (IS_ERR(ibh))
			return PTR_ERR(ibh);

		/* placeholder ops for now; erofs_write_tail_end() replaces them */
		ibh->op = &erofs_skip_write_bhops;
		inode->bh_inline = ibh; //tj: here
	}

	/* bind the inode to its buffer head */
	bh->fsprivate = erofs_igrab(inode);
	bh->op = &erofs_write_inode_bhops;
	inode->bh = bh;
	return 0;
}

这里主要是分配buffer供inode以及inline tail-end data使用。如果erofs_balloc()返回-ENOSPC（放不下inline data），就改用FLAT_PLAIN，并通过erofs_prepare_tail_block()为tail-end data额外准备一个block。inline data用->bh_inline来标识。

再来看接下来的erofs_write_tail_end():

int erofs_write_tail_end(struct erofs_inode *inode)
{
	struct erofs_buffer_head *bh, *ibh;

	bh = inode->bh_data;

	if (!inode->idata_size)
		goto out;

这是对齐case，no tail。看tail处理：

/* have enough room to inline data */
if (inode->bh_inline) {
	ibh = inode->bh_inline;

	ibh->fsprivate = erofs_igrab(inode);
	ibh->op = &erofs_write_inline_bhops;
} else {

这是inline data的情况，把ibh->op设为erofs_write_inline_bhops（它的->flush是erofs_bh_flush_write_inline()）。

/* buffer-head ops whose flush hook is erofs_bh_flush_write_inline() */
static struct erofs_bhops erofs_write_inline_bhops = {
	.flush = erofs_bh_flush_write_inline,
};

non-inline tail处理:

erofs_off_t pos;

erofs_mapbh(bh->block, true);
pos = erofs_btell(bh, true) - EROFS_BLKSIZ;
ret = dev_write(inode->idata, pos, inode->idata_size);
if (ret)
	return ret;
if (inode->idata_size < EROFS_BLKSIZ) {
	ret = dev_fillzero(pos + inode->idata_size,
			   EROFS_BLKSIZ - inode->idata_size,
			   false);
	if (ret)
		return ret;
}

还是通过buffer cache得到了pos，直接写入img。build完成后call erofs_bflush():

int main(int argc, char **argv)
{
	...
	/* flush all remaining buffers */
	if (!erofs_bflush(NULL))
		err = -EIO;
	else

bool erofs_bflush(struct erofs_buffer_block *bb)
{
	...
		list_for_each_entry_safe(bh, nbh, &p->buffers.list, list) {
			/* flush and remove bh */
			if (!bh->op->flush(bh)) //tj:here
				skip = true;
		}

这里的->flush就是erofs_bh_flush_write_inline()：

/*
 * ->flush hook for an inline tail-end buffer head.
 *
 * Writes inode->idata (idata_size bytes) at the offset reported by
 * erofs_btell(bh, false), then frees the in-memory tail-end copy, calls
 * erofs_iput() on the inode kept in bh->fsprivate and finishes through
 * erofs_bh_flush_generic_end().
 *
 * Returns false when dev_write() fails, otherwise whatever
 * erofs_bh_flush_generic_end() returns.
 */
static bool erofs_bh_flush_write_inline(struct erofs_buffer_head *bh)
{
	struct erofs_inode *const vi = bh->fsprivate;
	const erofs_off_t pos = erofs_btell(bh, false);

	if (dev_write(vi->idata, pos, vi->idata_size))
		return false;

	/* the tail-end data has been written; drop the in-memory copy */
	free(vi->idata);
	vi->idata = NULL;
	vi->idata_size = 0;

	erofs_iput(vi);
	return erofs_bh_flush_generic_end(bh);
}

->idata就是write_uncompressed_file_from_fd()（regular file）或erofs_write_file_from_buffer()（symlink）里保存下来的尾端数据。

另，整个的buffer cache就是用来管理不完整的metadata的，mark to check later.

关于inline data, readme有写到：

[1] According to the erofs on-disk format, the tail block of files
could be inlined aggressively with its metadata in order to reduce
the I/O overhead and save the storage space (called tail-packing).

查了下ext4：

The inline data feature was designed to handle the case that a file’s data is so tiny that it readily fits inside the inode, which (theoretically) reduces disk block consumption and reduces seeks.

就是存储到inode里。

refer doc

https://ext4.wiki.kernel.org/index.php/Ext4_Disk_Layout#Inline_Data