SDCardFS:删除文件后空间未释放问题分析

Android O开始默认使用sdcardfs，弃用了已经使用多年的fuse，原因就是sdcardfs比fuse性能更优。原生Android sdcardfs的由来：vfs > wrapfs(edu) > sdcardfs(Samsung) > sdcardfs(Google)。

sdcardfs有个问题：cp文件到SD卡(fat格式)再删除这个文件，该文件空间未释放。

file system头一回看，从源头先简单了解下vfs的几个重要概念：

inode: index node，每个object对应一个inode，当然这个object可以是directory, file等等。
dentry: directory entry，用来管理目录层级，它含有name，inode等。
dcache: 就是dentry cache，显然是加速dentry访问。
superblock: 代表的是文件系统，比如fs type，total blks，free blocks等。

ok，下来了解下wrapfs：

一般file system有两类：一个是纯内核(native)文件系统比如ext4，一个是用户态和内核态交互的，fuse就是。wrapfs是第三类，是一种null-layer stackable file system，性能接近纯内核文件系统，so这也是Android放弃fuse的原因。

边界分层如下图：

可以看出，wrapfs既充当vfs同时也充当native fs。

ok, 看下mmc1/SD卡(vfat)删除的流程，msm/kernel 3.18：

rm flow: only call vfat_unlink, 空间释放ok。
delete by apk: call sdcardfs_lookup then vfat_unlink, 空间释放err。

再来看下mmc0/data分区(f2fs)的删除流程：

delete by apk: call sdcardfs_unlink then f2fs_unlink, 空间释放ok。
rm: 和apk一样，先走sdcardfs_unlink再f2fs_unlink，空间释放ok。

那就有一个疑问：同样的rm unlink系统调用，SD卡(vfat)删除时为什么不走sdcardfs_unlink？mark to check later。

rm unlink flow on data/sdcardfs on f2fs：

[ 1509.351490] [<c034ef3c>] (f2fs_unlink) from [<c023b7f4>] (vfs_unlink2+0xe8/0x17c)
[ 1509.351508] [<c023b7f4>] (vfs_unlink2) from [<c02fd798>] (sdcardfs_unlink+0xc8/0x198)
[ 1509.351526] [<c02fd798>] (sdcardfs_unlink) from [<c023b7f4>] (vfs_unlink2+0xe8/0x17c)
[ 1509.351544] [<c023b7f4>] (vfs_unlink2) from [<c023ff4c>] (do_unlinkat+0xd4/0x1c4)
[ 1509.351561] [<c023ff4c>] (do_unlinkat) from [<c0106aa0>] (ret_fast_syscall+0x0/0x44)

rm unlink flow on SD/sdcardfs on vfat:

[ 1489.624809] [<c02fa5f4>] (vfat_unlink) from [<c023b7f4>] (vfs_unlink2+0xe8/0x17c)
[ 1489.624829] [<c023b7f4>] (vfs_unlink2) from [<c023ff4c>] (do_unlinkat+0xd4/0x1c4)
[ 1489.624847] [<c023ff4c>] (do_unlinkat) from [<c0106aa0>] (ret_fast_syscall+0x0/0x44)

so let's check do_unlinkat to see why the space is not freed:

static long do_unlinkat(int dfd, const char __user *pathname)
{
...
        if (!IS_ERR(dentry)) {
                /* Why not before? Because we want correct error value */
                if (nd.last.name[nd.last.len])
                        goto slashes;
                inode = dentry->d_inode;
                if (d_is_negative(dentry))
                        goto slashes;
                ihold(inode);
                error = security_path_unlink(&nd.path, dentry);
                if (error)
                        goto exit2;
                error = vfs_unlink2(nd.path.mnt, nd.path.dentry->d_inode, dentry, &delegated_inode);
exit2:
                dput(dentry);
        }    
        mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
        if (inode)
                iput(inode);    /* truncate the inode here */ //tj: 这里是关键

iput:

/**
 *	iput	- put an inode
 *	@inode: inode to put
 *
 *	Puts an inode, dropping its usage count. If the inode use count hits
 *	zero, the inode is then freed and may also be destroyed.
 *
 *	Consequently, iput() can sleep.
 */
void iput(struct inode *inode)
{
	/* A NULL inode is accepted and simply ignored. */
	if (!inode)
		return;
	BUG_ON(inode->i_state & I_CLEAR);
retry:
	/*
	 * Drop one reference on i_count; the body below only runs for the
	 * caller whose decrement succeeds in taking i_lock (per the header
	 * comment, when the use count hits zero).
	 */
	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
		/*
		 * Inode still has links and carries I_DIRTY_TIME: take the
		 * reference back, clear the flag, release the lock, mark the
		 * inode dirty and then retry the put from the top.
		 */
		if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
			atomic_inc(&inode->i_count);
			inode->i_state &= ~I_DIRTY_TIME;
			spin_unlock(&inode->i_lock);
			trace_writeback_lazytime_iput(inode);
			mark_inode_dirty_sync(inode);
			goto retry;
		}
		iput_final(inode); // tj: this is the path that must eventually be taken
	}
}

iput_final:

/*
 * Called when we're dropping the last reference
 * to an inode.
 *
 * Call the FS "drop_inode()" function, defaulting to
 * the legacy UNIX filesystem behaviour.  If it tells
 * us to evict inode, do so.  Otherwise, retain inode
 * in cache if fs is alive, sync and evict if fs is
 * shutting down.
 *
 * Every path through this function releases inode->i_lock
 * (see the spin_unlock() calls below), so it is entered
 * with that lock held.
 */
static void iput_final(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	const struct super_operations *op = inode->i_sb->s_op;
	int drop;

	WARN_ON(inode->i_state & I_NEW);

	/* Let the filesystem decide whether to drop; otherwise use the generic policy. */
	if (op->drop_inode)
		drop = op->drop_inode(inode);
	else
		drop = generic_drop_inode(inode);

	/*
	 * Not dropping and the superblock is still active: flag the inode
	 * as referenced, put it on the LRU and return without evicting.
	 */
	if (!drop && (sb->s_flags & MS_ACTIVE)) {
		inode->i_state |= I_REFERENCED;
		inode_add_lru(inode);
		spin_unlock(&inode->i_lock);
		return;
	}

	/*
	 * Not dropping but the superblock is no longer active: write the
	 * inode out first. I_WILL_FREE is set only for the duration of the
	 * write, during which i_lock is released.
	 */
	if (!drop) {
		inode->i_state |= I_WILL_FREE;
		spin_unlock(&inode->i_lock);
		write_inode_now(inode, 1);
		spin_lock(&inode->i_lock);
		WARN_ON(inode->i_state & I_NEW);
		inode->i_state &= ~I_WILL_FREE;
	}

	/* Mark as being freed and take it off the LRU before evicting. */
	inode->i_state |= I_FREEING;
	if (!list_empty(&inode->i_lru))
		inode_lru_list_del(inode);
	spin_unlock(&inode->i_lock);

	evict(inode); // tj: this is where the space gets released
}

evict:

static void evict(struct inode *inode)
{
	const struct super_operations *op = inode->i_sb->s_op;

	BUG_ON(!(inode->i_state & I_FREEING));
	BUG_ON(!list_empty(&inode->i_lru));

	if (!list_empty(&inode->i_wb_list))
		inode_wb_list_del(inode);

	inode_sb_list_del(inode);

	/*
	 * Wait for flusher thread to be done with the inode so that filesystem
	 * does not start destroying it while writeback is still running. Since
	 * the inode has I_FREEING set, flusher thread won't start new work on
	 * the inode.  We just have to wait for running writeback to finish.
	 */
	inode_wait_for_writeback(inode);

	if (op->evict_inode) {
		op->evict_inode(inode); // tj: call fat_evict_inode

fat_evict_inode:

/*
 * Eviction handler for a FAT inode (reached through op->evict_inode in
 * evict(), per the call trace discussed in this article).
 *
 * Drops the inode's page cache, frees its blocks when no links remain,
 * then tears down the remaining per-inode state.
 */
static void fat_evict_inode(struct inode *inode)
{
        truncate_inode_pages_final(&inode->i_data);
        /* Only an inode with no remaining links has its blocks released. */
        if (!inode->i_nlink) {
                inode->i_size = 0; 
                fat_truncate_blocks(inode, 0); // tj: blocks are released here
        }    
        invalidate_inode_buffers(inode);
        clear_inode(inode);
        fat_cache_inval_inode(inode);
        fat_detach(inode);
}

fat_truncate_blocks:

/*
 * Truncate the blocks of @inode down to @offset bytes.
 *
 * Clamps mmu_private to @offset, computes how many clusters are needed
 * to cover @offset bytes, hands that count to fat_free(), and finally
 * calls fat_flush_inodes() for the inode.
 */
void fat_truncate_blocks(struct inode *inode, loff_t offset)
{
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
        const unsigned int cluster_size = sbi->cluster_size;
        int nr_clusters;

        /*  
         * This protects against truncating a file bigger than it was then
         * trying to write into the hole.
         */
        if (MSDOS_I(inode)->mmu_private > offset)
                MSDOS_I(inode)->mmu_private = offset;

        /*
         * Round @offset up to a whole number of clusters; with offset == 0
         * this yields 0. NOTE(review): presumes cluster_size equals
         * 1 << cluster_bits - confirm against msdos_sb_info setup.
         */
        nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;

        fat_free(inode, nr_clusters);  // tj: got here...
        fat_flush_inodes(inode->i_sb, inode, NULL);
}

so, 删除文件的流程：

do_unlinkat -> iput -> iput_final -> evict -> fat_evict_inode

可以看出，evict是有条件触发的:

if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {

ok, 就是说iput把i_count减1之后必须为0，vfat才能truncate block。加了调试信息果然发现进入iput时i_count是2，减1后仍为1，显然就没走evict。

so, what is i_count of an inode? 就是inode的使用(引用)计数，怎么理解？i_count的递增被封装在__iget里，可以看到很多接口会call __iget。还有一个计数是i_nlink, 是inode的hard link数目，只有两者都是0才释放该空间。

那sdcardfs lookup哪里使用了i_count?

sdcardfs_lookup > __sdcardfs_lookup > __sdcardfs_interpose > sdcardfs_iget > igrab > __iget > atomic_inc(&inode->i_count);

sdcardfs lookup通过igrab持有了该inode的一个引用并且一直没有释放，导致接下来vfat unlink之后的iput无法把i_count减到0，也就不能进入evict。

SDCardFS:删除文件后空间未释放问题分析

SDCardFS:删除文件后空间未释放问题分析

参考文档