This walkthrough focuses on the page cache part of the read path, based on Linux kernel 4.14:

 407 ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
 408                    loff_t *pos)
 409 {
 410         if (file->f_op->read)
 411                 return file->f_op->read(file, buf, count, pos);
 412         else if (file->f_op->read_iter)
 413                 return new_sync_read(file, buf, count, pos);
 414         else
 415                 return -EINVAL;
 416 }
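
For context, a plain read(2) from user space reaches this function via sys_read() -> vfs_read() -> __vfs_read(). A minimal user-space sketch ("testfile" is a hypothetical path):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        ssize_t n;
        int fd = open("testfile", O_RDONLY);

        if (fd < 0)
                return 1;
        /* ends up in sys_read() -> vfs_read() -> __vfs_read() */
        n = read(fd, buf, sizeof(buf));
        if (n >= 0)
                printf("read %zd bytes\n", n);
        close(fd);
        return 0;
}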

The interface definitions of ->read and ->read_iter:

1699 struct file_operations {
1702         ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
1703         ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
1704         ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
1705         ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);

Now look at new_sync_read():

static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        struct kiocb kiocb;
        struct iov_iter iter;
        ssize_t ret;

        init_sync_kiocb(&kiocb, filp);
        kiocb.ki_pos = *ppos;
        iov_iter_init(&iter, READ, &iov, 1, len);

        ret = call_read_iter(filp, &kiocb, &iter);
        BUG_ON(ret == -EIOCBQUEUED);
        *ppos = kiocb.ki_pos;
        return ret;
}
1769 static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
1770                                      struct iov_iter *iter)
1771 {
1772         return file->f_op->read_iter(kio, iter);
1773 }

vfs_open() is where the inode gets associated with the file.

 858 struct file {
...
 863         struct path             f_path;
 864         struct inode            *f_inode;       /* cached value */
 865         const struct file_operations    *f_op;
...
 894         struct address_space    *f_mapping;
 879 int vfs_open(const struct path *path, struct file *file,
 880              const struct cred *cred)
 881 {
 882         struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags, 0);
 883
 884         if (IS_ERR(dentry))
 885                 return PTR_ERR(dentry);
 886
 887         file->f_path = *path;
 888         return do_dentry_open(file, d_backing_inode(dentry), NULL, cred);
 889 }
 714 static int do_dentry_open(struct file *f,
 715                           struct inode *inode,
 716                           int (*open)(struct inode *, struct file *),
 717                           const struct cred *cred)
 718 {
...
 726         f->f_inode = inode;
 727         f->f_mapping = inode->i_mapping; //tj: pagecache
...
 760         f->f_op = fops_get(inode->i_fop); //tj: f_op
2150 #define fops_get(fops) \
2151         (((fops) && try_module_get((fops)->owner) ? (fops) : NULL))

The VFS provides a generic implementation, generic_file_read_iter(); f2fs, for example, uses it directly:

2795 const struct file_operations f2fs_file_operations = {
2796         .llseek         = f2fs_llseek,
2797         .read_iter      = generic_file_read_iter,

In mm/filemap.c:

2214 /**
2215  * generic_file_read_iter - generic filesystem read routine
2216  * @iocb:       kernel I/O control block
2217  * @iter:       destination for the data read
2218  *
2219  * This is the "read_iter()" routine for all filesystems
2220  * that can use the page cache directly.
2221  */
2222 ssize_t
2223 generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
2224 {
2225         size_t count = iov_iter_count(iter);
2226         ssize_t retval = 0;
2227
2228         if (!count)
2229                 goto out; /* skip atime */
2230
2231         if (iocb->ki_flags & IOCB_DIRECT) {

IOCB_DIRECT means direct I/O, which bypasses the page cache.
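
IOCB_DIRECT is set when the file was opened with O_DIRECT (see iocb_flags() in include/linux/fs.h). A minimal user-space sketch ("testfile" is a hypothetical path; O_DIRECT generally wants the buffer, offset and length aligned to the device's logical block size, and 4096 is a common safe value):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        void *buf;
        int fd = open("testfile", O_RDONLY | O_DIRECT);

        if (fd < 0)
                return 1;
        /* O_DIRECT needs an aligned buffer */
        if (posix_memalign(&buf, 4096, 4096)) {
                close(fd);
                return 1;
        }
        read(fd, buf, 4096);    /* kiocb.ki_flags carries IOCB_DIRECT */
        free(buf);
        close(fd);
        return 0;
}

Otherwise the read falls through to the buffered path: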

2272
2273         retval = generic_file_buffered_read(iocb, iter, retval);
2274 out:
2275         return retval;
2276 }

On to generic_file_buffered_read():

1949 /**
1950  * generic_file_buffered_read - generic file read routine
1951  * @iocb:       the iocb to read
1952  * @iter:       data destination
1953  * @written:    already copied
1954  *
1955  * This is a generic file read routine, and uses the
1956  * mapping->a_ops->readpage() function for the actual low-level stuff.
1957  *
1958  * This is really ugly. But the goto's actually try to clarify some
1959  * of the logic when it comes to error handling etc.
1960  */
1961 static ssize_t generic_file_buffered_read(struct kiocb *iocb,
1962                 struct iov_iter *iter, ssize_t written)
1963 {

The function's header comment already says that the actual low-level work is done by mapping->a_ops->readpage().

1980         index = *ppos >> PAGE_SHIFT;
1981         prev_index = ra->prev_pos >> PAGE_SHIFT;
1982         prev_offset = ra->prev_pos & (PAGE_SIZE-1);
1983         last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
1984         offset = *ppos & ~PAGE_MASK;

So the read covers the page range index through last_index, and offset is the byte offset within the first page.
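
A quick worked example, assuming PAGE_SIZE = 4096 (PAGE_SHIFT = 12): a 10000-byte read at *ppos = 5000 gives index = 5000 >> 12 = 1, offset = 5000 & 4095 = 904, and last_index = (5000 + 10000 + 4095) >> 12 = 4, i.e. pages 1 through 3 are covered.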

1999                 page = find_get_page(mapping, index);

find_get_page() looks up the page at offset index in the page cache:

263 /**
264  * find_get_page - find and get a page reference
265  * @mapping: the address_space to search
266  * @offset: the page index
267  *
268  * Looks up the page cache slot at @mapping & @offset.  If there is a
269  * page cache page, it is returned with an increased refcount.
270  *
271  * Otherwise, %NULL is returned.
272  */
273 static inline struct page *find_get_page(struct address_space *mapping,
274                                         pgoff_t offset)
275 {
276         return pagecache_get_page(mapping, offset, 0, 0);
277 }
2000                 if (!page) {
2001                         if (iocb->ki_flags & IOCB_NOWAIT)
2002                                 goto would_block;
2003                         page_cache_sync_readahead(mapping,
2004                                         ra, filp,
2005                                         index, last_index - index);
2006                         page = find_get_page(mapping, index);
2007                         if (unlikely(page == NULL))
2008                                 goto no_cached_page;
2009                 }

If the page is not found, page_cache_sync_readahead(..., offset, req_size) does synchronous readahead, and then the lookup is retried; if it still fails we go to no_cached_page (unlikely). Now the synchronous readahead:

489 /**
490  * page_cache_sync_readahead - generic file readahead
491  * @mapping: address_space which holds the pagecache and I/O vectors
492  * @ra: file_ra_state which holds the readahead state
493  * @filp: passed on to ->readpage() and ->readpages()
494  * @offset: start offset into @mapping, in pagecache page-sized units
495  * @req_size: hint: total size of the read which the caller is performing in
496  *            pagecache pages
497  *
498  * page_cache_sync_readahead() should be called when a cache miss happened:
499  * it will submit the read.  The readahead logic may decide to piggyback more
500  * pages onto the read request if access patterns suggest it will improve
501  * performance.
502  */
503 void page_cache_sync_readahead(struct address_space *mapping,
504                                struct file_ra_state *ra, struct file *filp,
505                                pgoff_t offset, unsigned long req_size)
506 {
507         /* no read-ahead */
508         if (!ra->ra_pages)
509                 return;
510
511         /* be dumb */
512         if (filp && (filp->f_mode & FMODE_RANDOM)) {
513                 force_page_cache_readahead(mapping, filp, offset, req_size);
514                 return;
515         }
516
517         /* do read-ahead */
518         ondemand_readahead(mapping, ra, filp, false, offset, req_size);
519 }

Synchronous readahead is triggered on a cache miss. If the file is marked FMODE_RANDOM (random access), it goes through force_page_cache_readahead(); otherwise, ondemand_readahead().
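
FMODE_RANDOM can be set from user space with posix_fadvise(): POSIX_FADV_RANDOM sets it on the struct file. A minimal sketch ("testfile" is a hypothetical path):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("testfile", O_RDONLY);

        if (fd < 0)
                return 1;
        /* POSIX_FADV_RANDOM sets FMODE_RANDOM, so subsequent cache
         * misses take the force_page_cache_readahead() branch above */
        posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);
        close(fd);
        return 0;
}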

First, force_page_cache_readahead():

205 /*
206  * Chunk the readahead into 2 megabyte units, so that we don't pin too much
207  * memory at once.
208  */
209 int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
210                                pgoff_t offset, unsigned long nr_to_read)
211 {
212         struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
213         struct file_ra_state *ra = &filp->f_ra;
214         unsigned long max_pages;
215
216         if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
217                 return -EINVAL;
218
219         /*
220          * If the request exceeds the readahead window, allow the read to
221          * be up to the optimal hardware IO size
222          */
223         max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
224         nr_to_read = min(nr_to_read, max_pages);
225         while (nr_to_read) {
226                 int err;
227
228                 unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
229
230                 if (this_chunk > nr_to_read)
231                         this_chunk = nr_to_read;
232                 err = __do_page_cache_readahead(mapping, filp,
233                                                 offset, this_chunk, 0);
234                 if (err < 0)
235                         return err;
236
237                 offset += this_chunk;
238                 nr_to_read -= this_chunk;
239         }
240         return 0;
241 }

It simply performs the readahead in chunks of at most 2 MB (512 pages with 4 KB pages) so as not to pin too much memory at once; e.g. a 5 MB request is issued as chunks of 512, 512, and 256 pages. Next, ondemand_readahead():

372 /*
373  * A minimal readahead algorithm for trivial sequential/random reads.
374  */
375 static unsigned long
376 ondemand_readahead(struct address_space *mapping,
377                    struct file_ra_state *ra, struct file *filp,
378                    bool hit_readahead_marker, pgoff_t offset,
379                    unsigned long req_size)
380 {

We will skip the algorithm itself; it maintains the readahead window (ra->start, ra->size, ra->async_size) and eventually calls __do_page_cache_readahead().
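
For reference, the window state lives in struct file_ra_state; in 4.14's include/linux/fs.h it looks roughly like this:

struct file_ra_state {
        pgoff_t start;                  /* where readahead started */
        unsigned int size;              /* # of readahead pages */
        unsigned int async_size;        /* do asynchronous readahead when
                                           there are only # of pages ahead */
        unsigned int ra_pages;          /* Maximum readahead window */
        unsigned int mmap_miss;         /* Cache miss stat for mmap accesses */
        loff_t prev_pos;                /* Cache last read() position */
};

Now __do_page_cache_readahead():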

142 /*
143 * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates all
144 * the pages first, then submits them all for I/O. This avoids the very bad
145 * behaviour which would occur if page allocations are causing VM writeback.
146 * We really don't want to intermingle reads and writes like that.
147 *
148 * Returns the number of pages requested, or the maximum amount of I/O allowed.
149 */
150 int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
151                 pgoff_t offset, unsigned long nr_to_read,
152                 unsigned long lookahead_size)

All the pages are allocated first, then the I/O is submitted:

193         /*
194          * Now start the IO.  We ignore I/O errors - if the page is not
195          * uptodate then the caller will launch readpage again, and
196          * will then handle the error.
197          */
198         if (ret)
199                 read_pages(mapping, filp, &page_pool, ret, gfp_mask);
111 static int read_pages(struct address_space *mapping, struct file *filp,
...
120         if (mapping->a_ops->readpages) {
121                 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
122                 /* Clean up the remaining pages */
123                 put_pages_list(pages);
124                 goto out;
125         }
126
127         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
128                 struct page *page = lru_to_page(pages);
129                 list_del(&page->lru);
130                 if (!add_to_page_cache_lru(page, mapping, page->index, gfp))
131                         mapping->a_ops->readpage(filp, page);
132                 put_page(page);
133         }

->readpages() is preferred; if the filesystem doesn't provide it, ->readpage() is called page by page.
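
For a feel of where these callbacks come from: a simple block-based filesystem would typically wire them to the generic mpage helpers. A hypothetical sketch (the myfs_* names are made up; myfs_get_block is an assumed block-mapping callback, defined elsewhere):

#include <linux/fs.h>
#include <linux/mpage.h>

/* assumed: maps a file-relative block to an on-disk block */
static int myfs_get_block(struct inode *inode, sector_t iblock,
                          struct buffer_head *bh_result, int create);

static int myfs_readpage(struct file *file, struct page *page)
{
        return mpage_readpage(page, myfs_get_block);
}

static int myfs_readpages(struct file *file, struct address_space *mapping,
                          struct list_head *pages, unsigned nr_pages)
{
        return mpage_readpages(mapping, pages, nr_pages, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
        .readpage       = myfs_readpage,
        .readpages      = myfs_readpages,
};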

Back in generic_file_buffered_read():

2010                 if (PageReadahead(page)) {
2011                         page_cache_async_readahead(mapping,
2012                                         ra, filp, page,
2013                                         index, last_index - index);
2014                 }
2015                 if (!PageUptodate(page)) {
2016                         if (iocb->ki_flags & IOCB_NOWAIT) {
2017                                 put_page(page);
2018                                 goto would_block;
2019                         }
2020
2021                         /*
2022                          * See comment in do_read_cache_page on why
2023                          * wait_on_page_locked is used to avoid unnecessarily
2024                          * serialisations and why it's safe.
2025                          */
2026                         error = wait_on_page_locked_killable(page);

Here the page was found in the page cache, but it has PG_readahead set (the marker left by a previous readahead), so asynchronous readahead is kicked off.

Line 2015 then checks whether the page is up to date. Why might it not be? The comment points at do_read_cache_page() and its use of wait_on_page_locked, so let's take a look:

2689 static struct page *do_read_cache_page(struct address_space *mapping,
2690                                 pgoff_t index,
2691                                 int (*filler)(void *, struct page *),
2692                                 void *data,
2693                                 gfp_t gfp)
2694 {
2695         struct page *page;
2696         int err;
2697 repeat:
2698         page = find_get_page(mapping, index);
2699         if (!page) {
2700                 page = __page_cache_alloc(gfp | __GFP_COLD);
2701                 if (!page)
2702                         return ERR_PTR(-ENOMEM);
2703                 err = add_to_page_cache_lru(page, mapping, index, gfp);
2704                 if (unlikely(err)) {
2705                         put_page(page);
2706                         if (err == -EEXIST)
2707                                 goto repeat;
2708                         /* Presumably ENOMEM for radix tree node */
2709                         return ERR_PTR(err);
2710                 }
2711
2712 filler:
2713                 err = filler(data, page);
2714                 if (err < 0) {
2715                         put_page(page);
2716                         return ERR_PTR(err);
2717                 }
2718
2719                 page = wait_on_page_read(page);
2720                 if (IS_ERR(page))
2721                         return page;
2722                 goto out;
2723         }
2724         if (PageUptodate(page))
2725                 goto out;

After find_get_page() finds the page, the page may still be locked:

2727         /*
2728          * Page is not up to date and may be locked due one of the following
2729          * case a: Page is being filled and the page lock is held
2730          * case b: Read/write error clearing the page uptodate status
2731          * case c: Truncation in progress (page locked)
2732          * case d: Reclaim in progress
2733          *
...
2757          */
2758         wait_on_page_locked(page);
2759         if (PageUptodate(page))
2760                 goto out;
2761
2762         /* Distinguish between all the cases under the safety of the lock */
2763         lock_page(page);
2764
2765         /* Case c or d, restart the operation */
2766         if (!page->mapping) {
2767                 unlock_page(page);
2768                 put_page(page);
2769                 goto repeat;
2770         }
2771
2772         /* Someone else locked and filled the page in a very small window */
2773         if (PageUptodate(page)) {
2774                 unlock_page(page);
2775                 goto out;
2776         }
2777         goto filler;
2778
2779 out:
2780         mark_page_accessed(page);
2781         return page;
2782 }

As you can see, page state handling is quite involved.

Waiting for the page to be unlocked:

516 /*
517  * Wait for a page to be unlocked.
518  *
519  * This must be called with the caller "holding" the page,
520  * ie with increased "page->count" so that the page won't
521  * go away during the wait..
522  */
523 static inline void wait_on_page_locked(struct page *page)
524 {
525         if (PageLocked(page))
526                 wait_on_page_bit(compound_head(page), PG_locked);
527 }

OK, so after the unlock, if the page is uptodate we take the page_ok path: copy it to user space, then move on to the next page:

2091                 /*
2092                  * Ok, we have the page, and it's up-to-date, so
2093                  * now we can copy it to user space...
2094                  */
2095
2096                 ret = copy_page_to_iter(page, offset, nr, iter);
2097                 offset += ret;
2098                 index += offset >> PAGE_SHIFT;
2099                 offset &= ~PAGE_MASK;
2100                 prev_offset = offset;
2101
2102                 put_page(page);
2103                 written += ret;
2104                 if (!iov_iter_count(iter))
2105                         goto out;
2106                 if (ret < nr) {
2107                         error = -EFAULT;
2108                         goto out;
2109                 }
2110                 continue;
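
Continuing the earlier worked example: the first iteration copies nr = 4096 - 904 = 3192 bytes out of page 1; offset += ret makes offset 4096, index += offset >> PAGE_SHIFT advances index to 2, and offset &= ~PAGE_MASK resets offset to 0, so the next iteration copies a full page.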

In one sentence: whatever cannot be found in the page cache goes down to the low-level ->readpage()/->readpages(), whose only job is to read from disk.

Done.