choice
	prompt "File decompression options"
	depends on SQUASHFS
	help
	  Squashfs now supports two options for decompressing file
	  data.  Traditionally Squashfs has decompressed into an
	  intermediate buffer and then memcopied it into the page
	  cache.  Squashfs now supports the ability to decompress
	  directly into the page cache.

	  If unsure, select "Decompress file data into an intermediate
	  buffer"
config SQUASHFS_FILE_CACHE
	bool "Decompress file data into an intermediate buffer"
	help
	  Decompress file data into an intermediate buffer and then
	  memcopy it into the page cache.
config SQUASHFS_FILE_DIRECT
	bool "Decompress files directly into the page cache"
	help
	  Directly decompress file data into the page cache.
	  Doing so can significantly improve performance because
	  it eliminates a memcpy and it also removes the lock contention
	  on the single buffer.
/* * Look-up block in cache, and increment usage count. If not in cache, read * and decompress it from disk. */ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, struct squashfs_cache *cache, u64 block, int length) {
这里的cache是啥?我们来看:
struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
	struct squashfs_cache *cache, u64 block, int length)
{
	int i, n;
	struct squashfs_cache_entry *entry;
spin_lock(&cache->lock);
while (1) { for (i = cache->curr_blk, n = 0; n < cache->entries; n++) { if (cache->entry[i].block == block) { cache->curr_blk = i; break; } i = (i + 1) % cache->entries; }
if (n == cache->entries) { /* * Block not in cache, if all cache entries are used * go to sleep waiting for one to become available. */
在入参cache里查找是否有block, 如果没有会从disk读出来:
/* * Initialise chosen cache entry, and fill it in from * disk. */ cache->unused--; entry->block = block; entry->refcount = 1; entry->pending = 1; entry->num_waiters = 0; entry->error = 0; spin_unlock(&cache->lock);
/* Copy data into page cache */ voidsquashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer, int bytes, int offset) { structinode *inode = page->mapping->host; structsquashfs_sb_info *msblk = inode->i_sb->s_fs_info; int i, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; int start_index = page->index & ~mask, end_index = start_index | mask;
/* * Loop copying datablock into pages. As the datablock likely covers * many PAGE_SIZE pages (default block size is 128 KiB) explicitly * grab the pages from the page cache, except for the page that we've * been called to fill. */ for (i = start_index; i <= end_index && bytes > 0; i++, bytes -= PAGE_SIZE, offset += PAGE_SIZE) { structpage *push_page; int avail = buffer ? min_t(int, bytes, PAGE_SIZE) : 0;
TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
push_page = (i == page->index) ? page : grab_cache_page_nowait(page->mapping, i);
if (!push_page) continue;
if (PageUptodate(push_page)) goto skip_page;
squashfs_fill_page(push_page, buffer, offset, avail); skip_page: unlock_page(push_page); if (i != page->index) put_page(push_page); } }
/** * pagecache_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * @fgp_flags: PCG flags * @gfp_mask: gfp mask to use for the page cache data page allocation * * Looks up the page cache slot at @mapping & @offset. * * PCG flags modify how the page is returned. * * @fgp_flags can be: * * - FGP_ACCESSED: the page will be marked accessed * - FGP_LOCK: Page is return locked * - FGP_CREAT: If page is not present then a new page is allocated using * @gfp_mask and added to the page cache and the VM's LRU * list. The page is returned locked and with an increased * refcount. * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do * its own locking dance if the page is already in cache, or unlock the page * before returning if we had to add the page to pagecache. * * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even * if the GFP flags specified for FGP_CREAT are atomic. * * If there is a page cache page, it is returned with an increased refcount. */ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, int fgp_flags, gfp_t gfp_mask) {
voidsquashfs_fill_page(struct page *page, struct squashfs_cache_entry *buffer, int offset, int avail) { int copied; void *pageaddr;
pageaddr = kmap_atomic(page); //tj: virtual page of this page copied = squashfs_copy_data(pageaddr, buffer, offset, avail); //tj: here memset(pageaddr + copied, 0, PAGE_SIZE - copied); kunmap_atomic(pageaddr);
flush_dcache_page(page); if (copied == avail) SetPageUptodate(page); else SetPageError(page); }
ok, 为了性能,来看file_direct.c:
/* Read separately compressed datablock directly into page cache */
int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
	int expected)
首先创建一个page actor:
/* * Create a "page actor" which will kmap and kunmap the * page cache pages appropriately within the decompressor */ actor = squashfs_page_actor_init_special(page, pages, 0); if (actor == NULL) goto out;
struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
	int pages, int length)
{
	struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
/* Try to grab all the pages covered by the Squashfs block */ for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) { page[i] = (n == target_page->index) ? target_page : grab_cache_page_nowait(target_page->mapping, n);
if (page[i] == NULL) { missing_pages++; continue; }
if (missing_pages) { /* * Couldn't get one or more pages, this page has either * been VM reclaimed, but others are still in the page cache * and uptodate, or we're racing with another thread in * squashfs_readpage also trying to grab them. Fall back to * using an intermediate buffer. */ res = squashfs_read_cache(target_page, block, bsize, pages, page, expected); if (res < 0) goto mark_errored;
goto out; }
/*
 * Fallback path for squashfs_readpage_block(): read the datablock via an
 * intermediate cache entry instead of decompressing directly into the page
 * cache (used when not all covered pages could be grabbed).
 *
 * @target_page: the page the read was issued for (not released here)
 * @block:       on-disk location of the compressed datablock
 * @bsize:       compressed block size
 * @pages:       number of entries in @page
 * @page:        array of grabbed page cache pages; NULL entries are pages
 *               that could not be grabbed and are simply skipped
 * @bytes:       number of decompressed bytes expected
 *
 * Returns 0 on success or the cache entry's negative error code.
 */
static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
	int pages, struct page **page, int bytes)
{
	struct inode *i = target_page->mapping->host;
	/* read + decompress the block into an intermediate cache entry */
	struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
						block, bsize);
	int res = buffer->error, n, offset = 0;

	if (res) {
		ERROR("Unable to read page, block %llx, size %x\n", block,
			bsize);
		goto out;
	}

	/* copy the decompressed data page by page into the grabbed pages */
	for (n = 0; n < pages && bytes > 0; n++,
			bytes -= PAGE_SIZE, offset += PAGE_SIZE) {
		int avail = min_t(int, bytes, PAGE_SIZE);

		if (page[n] == NULL)
			continue;

		squashfs_fill_page(page[n], buffer, offset, avail);
		unlock_page(page[n]);
		if (page[n] != target_page)
			put_page(page[n]);	/* target_page ref kept for caller */
	}

out:
	squashfs_cache_put(buffer);	/* drop cache entry refcount */
	return res;
}
如果能获取到,直接解压到page cache buffers:
/* Decompress directly into the page cache buffers */ res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
前文已经分析过了,最终解压到了actor->page, 来源一开始:
int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
	int expected)
/* Mark pages as uptodate, unlock and release */ for (i = 0; i < pages; i++) { flush_dcache_page(page[i]); SetPageUptodate(page[i]); unlock_page(page[i]); if (page[i] != target_page) //tj: 不释放target_page put_page(page[i]); }