参考staging erofs,on-disk inode layout相关实现是erofs_fs.h + super.c + inode.c。

data mappings

先看文档介绍:

- Metadata & data could be mixed by design;

Different from other read-only file systems, an EROFS volume is designed to be as simple as possible:

                              |-> aligned with the block size 
_____________________________________________________________
| |SB| | ... | Metadata | ... | Data | Metadata | ... | Data |
|_|__|_|_____|__________|_____|______|__________|_____|______|
0 +1K

What is metadata in a file system? Wikipedia defines it as follows:

Metadata is data that provides information about other data.

erofs super block放到了1K偏移处。superblock_read():

#define EROFS_SUPER_OFFSET      1024
static int superblock_read(struct super_block *sb)
{
struct erofs_sb_info *sbi;
struct buffer_head *bh;
struct erofs_super_block *layout;
unsigned int blkszbits;
int ret;

bh = sb_bread(sb, 0);

if (!bh) {
errln("cannot read erofs superblock");
return -EIO;
}

sbi = EROFS_SB(sb);
layout = (struct erofs_super_block *)((u8 *)bh->b_data
+ EROFS_SUPER_OFFSET); // tj: here

All data areas should be aligned with the block size, but metadata areas need not be. All metadata can now be observed in two different spaces (views):

  1. Inode metadata space
    Each valid inode should be aligned with an inode slot, which is a fixed value (32 bytes) and designed to be kept in line with v1 inode size.

    Each inode can be directly found with the following formula:
    inode offset = meta_blkaddr * block_size + 32 * nid

                                |-> aligned with 8B
                                           |-> followed closely
    + meta_blkaddr blocks                                      |-> another slot
     _____________________________________________________________________
    |  ...   | inode |  xattrs  | extents  | data inline | ... | inode ...
    |________|_______|(optional)|(optional)|__(optional)_|_____|__________
             |-> aligned with the inode slot size

Xattrs, extents, and inline data follow the corresponding inode with proper alignment, and they are optional depending on the data mapping. Currently, 3 valid data mappings are supported:
1) flat file data without data inline (no extent);
2) fixed-output size data compression (must have extents);
3) flat file data with tail-end data inline (no extent);

三种inode data mapping:

/*
* erofs inode data mapping (the numeric value of each enumerator below
* is the on-disk data mapping number listed here):
* 0 - inode plain without inline data A:
* inode, [xattrs], ... | ... | no-holed data
* 1 - inode VLE compression B:
* inode, [xattrs], extents ... | ...
* 2 - inode plain with inline data C:
* inode, [xattrs], last_inline_data, ... | ... | no-holed data
* 3~7 - reserved
*/
enum {
EROFS_INODE_LAYOUT_PLAIN, /* 0: plain, no inline data */
EROFS_INODE_LAYOUT_COMPRESSION, /* 1: VLE compression, extents follow the inode */
EROFS_INODE_LAYOUT_INLINE, /* 2: plain, last part of the data inlined after the inode */
EROFS_INODE_LAYOUT_MAX /* 3: number of layouts defined above; not a layout itself */
};

read super block

erofs_mount() > erofs_fill_super() > erofs_read_super()

mount时会call erofs_read_super()

static int erofs_read_super(struct super_block *sb,
const char *dev_name,
void *data, int silent)
{
struct inode *inode;
struct erofs_sb_info *sbi;
sb->s_fs_info = sbi;
...
err = superblock_read(sb);
if (err)
goto err_sbread;

superblock_read()就是find super block,super block结构赋值来源mkfs.erofs.

接下来在解析mount选项后会获取root inode给->s_root:

/* get the root inode */
inode = erofs_iget(sb, ROOT_NID(sbi), true);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto err_iget;
}

if (!S_ISDIR(inode->i_mode)) {
errln("rootino(nid %llu) is not a directory(i_mode %o)",
ROOT_NID(sbi), inode->i_mode);
err = -EINVAL;
iput(inode);
goto err_iget;
}

sb->s_root = d_make_root(inode); //tj: here
if (!sb->s_root) {
err = -ENOMEM;
goto err_iget;
}

如果root inode不是目录,那就无效了。

get inode

主要看下erofs_iget():

/*
 * erofs_iget - get the in-memory inode for inode number @nid on @sb.
 *
 * @sb:    super block to look the inode up on
 * @nid:   erofs inode number of the wanted inode
 * @isdir: passed through unchanged to fill_inode()
 *
 * Returns the inode on success. Returns ERR_PTR(-ENOMEM) when
 * erofs_iget_locked() yields NULL, or ERR_PTR(err) when fill_inode()
 * fails for a freshly created (I_NEW) inode.
 */
struct inode *erofs_iget(struct super_block *sb,
erofs_nid_t nid,
bool isdir)
{
struct inode *inode = erofs_iget_locked(sb, nid);

if (unlikely(!inode))
return ERR_PTR(-ENOMEM);

/* only an inode still flagged I_NEW needs to be filled in here */
if (inode->i_state & I_NEW) {
int err;
struct erofs_vnode *vi = EROFS_V(inode);

/* record the nid first: fill_inode() locates the on-disk inode via vi->nid */
vi->nid = nid;

err = fill_inode(inode, isdir);
if (likely(!err))
unlock_new_inode(inode);
else {
/* hand the half-initialized inode back and report the error instead */
iget_failed(inode);
inode = ERR_PTR(err);
}
}
return inode;
}

先是call erofs_iget_locked()获取inode,如果没有get到(返回NULL,几率小)就返回-ENOMEM;如果inode是I_NEW状态(刚分配、还没初始化),就记下nid并通过fill_inode()从磁盘填充它:成功则unlock_new_inode(),失败则iget_failed()并返回错误。

64bits及以上平台用iget_locked(),其他用iget5_locked(),两个都是vfs接口,not care now,let's check fill_inode():

static int fill_inode(struct inode *inode, int isdir)
{
struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
struct erofs_vnode *vi = EROFS_V(inode);
struct page *page;
void *data;
int err;
erofs_blk_t blkaddr;
unsigned int ofs;

trace_erofs_fill_inode(inode, isdir);

blkaddr = erofs_blknr(iloc(sbi, vi->nid));
ofs = erofs_blkoff(iloc(sbi, vi->nid));
...

上面用到的类型和helper(erofs_blknr()/erofs_blkoff()/iloc())定义如下:

typedef u64 erofs_off_t;

/* data type for filesystem-wide blocks number */
typedef u32 erofs_blk_t;

/* number of the block that contains byte address @addr */
#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ)
/* byte offset of @addr inside its block */
#define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ)
/* byte address of the start of block @nr; @nr is cast to erofs_off_t before the multiply */
#define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ)

/*
 * iloc - byte address of the on-disk inode identified by @nid.
 *
 * The result is the byte address of block sbi->meta_blkaddr plus
 * (nid << sbi->islotbits).
 * NOTE(review): islotbits is presumably log2 of the inode slot size - confirm.
 */
static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid)
{
return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits);
}

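按inode metadata view来:blknr_to_addr()把->meta_blkaddr乘以EROFS_BLKSIZ,所以->meta_blkaddr是block number(metadata区域的起始block);nid << islotbits对应公式里的32 * nid(即islotbits应为5)。iloc()负责算出inode的byte address,再由erofs_blknr()/erofs_blkoff()拆成block number和block内偏移。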

接下来会get inode page, mark to check later.

page = erofs_get_meta_page(inode->i_sb, blkaddr, isdir);

if (IS_ERR(page)) {
errln("failed to get inode (nid: %llu) page, err %ld",
vi->nid, PTR_ERR(page));
return PTR_ERR(page);
}
...
data = page_address(page);

err = read_inode(inode, data + ofs);

read_inode()会读data(来自inode page)的->i_advise走inode v1 or v2。没有错误会check是否是inline data。

static int fill_inline_data(struct inode *inode, void *data,
unsigned int m_pofs)
{
struct erofs_vnode *vi = EROFS_V(inode);
struct erofs_sb_info *sbi = EROFS_I_SB(inode);
const int mode = vi->datamode;
...
/* fast symlink (following ext4) */
if (S_ISLNK(inode->i_mode) && inode->i_size < PAGE_SIZE) {
...

fast symlink看条件就是target比较短(i_size < PAGE_SIZE)的symlink, rt? inline data是for fast symlink? let’s check mkfs.erofs:

int mkfs_relocate_sub_inodes(struct erofs_vnode *inode)
{
...
case EROFS_FT_DIR:
case EROFS_FT_SYMLINK:
unaligned = d->i_size % EROFS_BLKSIZE;
nblocks = d->i_size / EROFS_BLKSIZE;

if (unaligned > erofs_calc_inline_data_size(d) ||
(unaligned == 0 && nblocks != 0)) {
d->i_dmode = EROFS_INODE_LAYOUT_PLAIN;
mkfs_rank_inode(d);

if (unaligned != 0)
nblocks++;
blkaddr = erofs_alloc_blocks(nblocks);
if (!blkaddr)
return -ENOSPC;

d->i_blkaddr = blkaddr;
} else {
d->i_dmode = EROFS_INODE_LAYOUT_INLINE;
d->i_inline_datalen = unaligned;
mkfs_rank_inode(d);

if (nblocks > 0) {
blkaddr = erofs_alloc_blocks(nblocks);
if (!blkaddr)
return -ENOSPC;

d->i_blkaddr = blkaddr;
} else {
d->i_blkaddr = 0;
}
}
break;

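从上面mkfs的代码看,EROFS_FT_DIR和EROFS_FT_SYMLINK走同一个分支:只要尾部不足一个block的数据(i_size % EROFS_BLKSIZE)不超过erofs_calc_inline_data_size(),并且不是“大小刚好block对齐且非空”的情况,就用EROFS_INODE_LAYOUT_INLINE。所以inline data不只是for fast symlink,EROFS_FT_DIR type的也可以。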

Done.