Android O开始默认使用了sdcardfs,弃用了好多年的fuse,原因就是sdcardfs比fuse性能更优,原生Android sdcardfs的由来:vfs > wrapfs(edu) > sdcardfs(Samsung) > sdcardfs(Google)。

sdcardfs有个问题:cp文件到SD卡(fat格式)再删除这个文件,该文件空间未释放。

file system头一回看,从源头先简单了解下vfs的几个重要概念:

  • inode: index node,每个object对应一个inode,当然这个object可以是directory, file等等。
  • dentry: directory entry,用来管理目录层级,它含有name,inode等。
  • dcache: 就是dentry cache,显然是加速dentry访问。
  • superblock: 代表的是文件系统,比如fs type,total blks,free blocks等。

ok,下来了解下wrapfs:

一般file system有两类:一个是纯内核(native)文件系统比如ext4,一个是用户态和内核态交互的,fuse就是。wrapfs是第三类,是一种null-layer stackable file system,性能接近纯内核文件系统,so这也是Android放弃fuse的原因。

边界分层如下图:

wrapfs-border.png

可以看出,wrapfs既充当vfs同时也充当native fs。

ok, 看下mmc1/SD卡(vfat)删除的流程,msm/kernel 3.18:

  • rm flow: only call vfat_unlink, 空间释放ok。
  • delete by apk: call sdcardfs_lookup then vfat_unlink, 空间释放err。

再来看下mmc0/data分区(f2fs)的删除流程:

  • delete by apk: call sdcardfs_unlink then f2fs_unlink, 空间释放ok。
  • rm: 和apk一样,先走sdcardfs_unlink再f2fs_unlink,空间释放ok。

那就有一个疑问:同样的rm unlink系统调用,SD卡(vfat)删除时为什么不走sdcardfs_unlink?mark to check later。

rm unlink flow on data/sdcardfs on f2fs:

[ 1509.351490] [<c034ef3c>] (f2fs_unlink) from [<c023b7f4>] (vfs_unlink2+0xe8/0x17c)
[ 1509.351508] [<c023b7f4>] (vfs_unlink2) from [<c02fd798>] (sdcardfs_unlink+0xc8/0x198)
[ 1509.351526] [<c02fd798>] (sdcardfs_unlink) from [<c023b7f4>] (vfs_unlink2+0xe8/0x17c)
[ 1509.351544] [<c023b7f4>] (vfs_unlink2) from [<c023ff4c>] (do_unlinkat+0xd4/0x1c4)
[ 1509.351561] [<c023ff4c>] (do_unlinkat) from [<c0106aa0>] (ret_fast_syscall+0x0/0x44)

rm unlink flow on SD/sdcardfs on vfat:

[ 1489.624809] [<c02fa5f4>] (vfat_unlink) from [<c023b7f4>] (vfs_unlink2+0xe8/0x17c)
[ 1489.624829] [<c023b7f4>] (vfs_unlink2) from [<c023ff4c>] (do_unlinkat+0xd4/0x1c4)
[ 1489.624847] [<c023ff4c>] (do_unlinkat) from [<c0106aa0>] (ret_fast_syscall+0x0/0x44)

so let's check do_unlinkat to see why the space is not freed:

static long do_unlinkat(int dfd, const char __user *pathname)
{
...
        if (!IS_ERR(dentry)) {
                /* Why not before? Because we want correct error value */
                if (nd.last.name[nd.last.len])
                        goto slashes;
                inode = dentry->d_inode;
                if (d_is_negative(dentry))
                        goto slashes;
                ihold(inode);
                error = security_path_unlink(&nd.path, dentry);
                if (error)
                        goto exit2;
                error = vfs_unlink2(nd.path.mnt, nd.path.dentry->d_inode, dentry, &delegated_inode);
exit2:
                dput(dentry);
        }    
        mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
        if (inode)
                iput(inode);    /* truncate the inode here */ //tj: 这里是关键

iput:

/**
 *    iput    - put an inode
 *    @inode: inode to put
 *
 *    Puts an inode, dropping its usage count. If the inode use count hits
 *    zero, the inode is then freed and may also be destroyed.
 *
 *    Consequently, iput() can sleep.
 *
 *    A NULL @inode is accepted and ignored.
 */
void iput(struct inode *inode)
{
    if (!inode)
        return;
    BUG_ON(inode->i_state & I_CLEAR);
retry:
    /*
     * Continue only when this put is the one that drops i_count to zero
     * (i_lock is then held); if other users still hold the inode, nothing
     * further happens here and iput_final()/evict() are not reached.
     */
    if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
        if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
            /*
             * Still linked and I_DIRTY_TIME is set: take the reference
             * back, clear the flag, release i_lock, mark the inode dirty
             * and then redo the put from the top.
             */
            atomic_inc(&inode->i_count);
            inode->i_state &= ~I_DIRTY_TIME;
            spin_unlock(&inode->i_lock);
            trace_writeback_lazytime_iput(inode);
            mark_inode_dirty_sync(inode);
            goto retry;
        }
        iput_final(inode); /* tj: the put must end up here for the inode to be evicted */
    }
}

iput_final:

/*
 * Called when we're dropping the last reference
 * to an inode.
 *
 * Call the FS "drop_inode()" function, defaulting to
 * the legacy UNIX filesystem behaviour.  If it tells
 * us to evict inode, do so.  Otherwise, retain inode
 * in cache if fs is alive, sync and evict if fs is
 * shutting down.
 *
 * NOTE(review): the function unlocks inode->i_lock on every path without
 * taking it first, so it presumably expects the caller to hold i_lock on
 * entry - confirm against the caller.
 */
static void iput_final(struct inode *inode)
{
    struct super_block *sb = inode->i_sb;
    const struct super_operations *op = inode->i_sb->s_op;
    int drop;

    WARN_ON(inode->i_state & I_NEW);

    /* Ask the filesystem whether to drop the inode; fall back to the generic policy. */
    if (op->drop_inode)
        drop = op->drop_inode(inode);
    else
        drop = generic_drop_inode(inode);

    /* Not dropping and the superblock is still active: keep the inode on the LRU and return. */
    if (!drop && (sb->s_flags & MS_ACTIVE)) {
        inode->i_state |= I_REFERENCED;
        inode_add_lru(inode);
        spin_unlock(&inode->i_lock);
        return;
    }

    /*
     * Not dropping but the superblock is no longer active: write the inode
     * out first. i_lock is released around write_inode_now() and
     * I_WILL_FREE is set for the duration of the write.
     */
    if (!drop) {
        inode->i_state |= I_WILL_FREE;
        spin_unlock(&inode->i_lock);
        write_inode_now(inode, 1);
        spin_lock(&inode->i_lock);
        WARN_ON(inode->i_state & I_NEW);
        inode->i_state &= ~I_WILL_FREE;
    }

    /* Mark the inode as being freed and take it off the LRU before evicting. */
    inode->i_state |= I_FREEING;
    if (!list_empty(&inode->i_lru))
        inode_lru_list_del(inode);
    spin_unlock(&inode->i_lock);

    evict(inode); /* tj: the space is released from here */
}

evict:

static void evict(struct inode *inode)
{
    const struct super_operations *op = inode->i_sb->s_op;

    BUG_ON(!(inode->i_state & I_FREEING));
    BUG_ON(!list_empty(&inode->i_lru));

    if (!list_empty(&inode->i_wb_list))
        inode_wb_list_del(inode);

    inode_sb_list_del(inode);

    /*
     * Wait for flusher thread to be done with the inode so that filesystem
     * does not start destroying it while writeback is still running. Since
     * the inode has I_FREEING set, flusher thread won't start new work on
     * the inode.  We just have to wait for running writeback to finish.
     */
    inode_wait_for_writeback(inode);

    if (op->evict_inode) {
        op->evict_inode(inode); // tj: call fat_evict_inode 

fat_evict_inode:

/*
 * FAT inode eviction routine (the article reaches it through
 * op->evict_inode in evict()).
 *
 * Drops the inode's page cache pages, and - only when the inode has no
 * remaining links (i_nlink == 0) - sets its size to 0 and truncates its
 * blocks. It then invalidates buffers, clears the inode and detaches it
 * from the FAT-side caches.
 */
static void fat_evict_inode(struct inode *inode)
{
        truncate_inode_pages_final(&inode->i_data);
        /* Only an unlinked inode gives its blocks back. */
        if (!inode->i_nlink) {
                inode->i_size = 0; 
                fat_truncate_blocks(inode, 0); /* tj: the space is released here */
        }    
        invalidate_inode_buffers(inode);
        clear_inode(inode);
        fat_cache_inval_inode(inode);
        fat_detach(inode);
}

fat_truncate_blocks:

/*
 * Truncate the blocks of @inode down to @offset bytes.
 *
 * @inode:  inode whose allocation is being cut back
 * @offset: new length in bytes; fat_evict_inode() passes 0
 *
 * Clamps mmu_private to @offset, converts @offset to a cluster count and
 * passes that count to fat_free(), then calls fat_flush_inodes() for the
 * inode's superblock.
 */
void fat_truncate_blocks(struct inode *inode, loff_t offset)
{
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
        const unsigned int cluster_size = sbi->cluster_size;
        int nr_clusters;

        /*  
         * This protects against truncating a file bigger than it was then
         * trying to write into the hole.
         */
        if (MSDOS_I(inode)->mmu_private > offset)
                MSDOS_I(inode)->mmu_private = offset;

        /*
         * Round offset up to whole clusters (assumes cluster_size ==
         * 1 << cluster_bits - TODO confirm); an offset of 0 gives 0.
         */
        nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;

        fat_free(inode, nr_clusters);  /* tj: got here... */
        fat_flush_inodes(inode->i_sb, inode, NULL);
}

so, 删除文件的流程:

do_unlinkat -> iput -> iput_final -> evict -> fat_evict_inode

可以看出,evict是有条件触发的:

    if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {

ok, 就是说i_count必须减到0 vfat才能truncate block, 加了调试信息果然发现进入iput时i_count是2,减1之后仍是1而不是0,显然就没走evict。

so, what is i_count of an inode? 就是inode的使用计数,怎么理解?i_count被封在__iget里,可以看到很多接口会call __iget。还有一个计数是i_nlink, 是inode的hard link数目,只有两者都是0才释放该空间。

那sdcardfs lookup哪里使用了i_count?

sdcardfs_lookup > __sdcardfs_lookup > __sdcardfs_interpose > sdcardfs_iget > igrab > __iget > atomic_inc(&inode->i_count);

sdcardfs lookup一直使用该inode导致在接下来的vfat unlink不能进入evict。

参考文档