This walkthrough focuses on the page cache side of the read path, using Linux kernel 4.14 as the reference:

407 ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
408 		   loff_t *pos)
409 {
410 	if (file->f_op->read)
411 		return file->f_op->read(file, buf, count, pos);
412 	else if (file->f_op->read_iter)
413 		return new_sync_read(file, buf, count, pos);
414 	else
415 		return -EINVAL;
416 }
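
For context: __vfs_read() is reached from the read(2) syscall handler via vfs_read(). A trivial userspace program that drives this whole path (the file name is just a placeholder):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open("/etc/hostname", O_RDONLY);	/* any regular file */
	if (fd < 0) { perror("open"); return 1; }

	/* read(2) -> vfs_read() -> __vfs_read() */
	ssize_t n = read(fd, buf, sizeof(buf));
	if (n < 0) { perror("read"); close(fd); return 1; }

	printf("read %zd bytes\n", n);
	close(fd);
	return 0;
}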

The ->read and ->read_iter hooks are defined in struct file_operations:

1699 struct file_operations {
...
1702 	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
1703 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
1704 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
1705 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);

Let's look at new_sync_read():

static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	iov_iter_init(&iter, READ, &iov, 1, len);

	ret = call_read_iter(filp, &kiocb, &iter);
	BUG_ON(ret == -EIOCBQUEUED);
	*ppos = kiocb.ki_pos;
	return ret;
}

new_sync_read() wraps the user buffer in a single-element iovec plus an iov_iter and dispatches through call_read_iter():

1769 static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
1770 				     struct iov_iter *iter)
1771 {
1772 	return file->f_op->read_iter(kio, iter);
1773 }
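
The same iov_iter machinery serves vectored I/O: readv(2) arrives with a multi-element array (via vfs_readv()) and ends up in the same ->read_iter() path. A userspace sketch (the file name is a placeholder):

#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char hdr[16], body[4096];
	/* two destination buffers -> a two-element iov_iter in the kernel */
	struct iovec iov[2] = {
		{ .iov_base = hdr,  .iov_len = sizeof(hdr)  },
		{ .iov_base = body, .iov_len = sizeof(body) },
	};
	int fd = open("/etc/hostname", O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }

	ssize_t n = readv(fd, iov, 2);	/* scatter read through ->read_iter() */
	if (n < 0) perror("readv");
	else printf("read %zd bytes\n", n);
	close(fd);
	return 0;
}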

At open time, vfs_open() attaches the inode to the struct file.

858 struct file {
...
863 	struct path		f_path;
864 	struct inode		*f_inode;	/* cached value */
865 	const struct file_operations	*f_op;
...
894 	struct address_space	*f_mapping;

879 int vfs_open(const struct path *path, struct file *file,
880 	     const struct cred *cred)
881 {
882 	struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags, 0);
883
884 	if (IS_ERR(dentry))
885 		return PTR_ERR(dentry);
886
887 	file->f_path = *path;
888 	return do_dentry_open(file, d_backing_inode(dentry), NULL, cred);
889 }

714 static int do_dentry_open(struct file *f,
715 			  struct inode *inode,
716 			  int (*open)(struct inode *, struct file *),
717 			  const struct cred *cred)
718 {
...
726 	f->f_inode = inode;
727 	f->f_mapping = inode->i_mapping;	//tj: pagecache
...
760 	f->f_op = fops_get(inode->i_fop);	//tj: f_op

2150 #define fops_get(fops) \
2151 	(((fops) && try_module_get((fops)->owner) ? (fops) : NULL))

The VFS provides a generic generic_file_read_iter() implementation; f2fs, for example, uses it directly:

2795 const struct file_operations f2fs_file_operations = {
2796 	.llseek		= f2fs_llseek,
2797 	.read_iter	= generic_file_read_iter,

It lives in mm/filemap.c:

2214 /**
2215  * generic_file_read_iter - generic filesystem read routine
2216  * @iocb:	kernel I/O control block
2217  * @iter:	destination for the data read
2218  *
2219  * This is the "read_iter()" routine for all filesystems
2220  * that can use the page cache directly.
2221  */
2222 ssize_t
2223 generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
2224 {
2225 	size_t count = iov_iter_count(iter);
2226 	ssize_t retval = 0;
2227
2228 	if (!count)
2229 		goto out; /* skip atime */
2230
2231 	if (iocb->ki_flags & IOCB_DIRECT) {

The IOCB_DIRECT branch is direct I/O: it bypasses the page cache entirely (see the userspace sketch after this snippet).

2272
2273 	retval = generic_file_buffered_read(iocb, iter, retval);
2274 out:
2275 	return retval;
2276 }
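
IOCB_DIRECT is set when the file was opened with O_DIRECT. A minimal userspace sketch, assuming 4096-byte alignment satisfies the filesystem's requirements (they vary):

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	void *buf;
	if (argc < 2) return 1;

	int fd = open(argv[1], O_RDONLY | O_DIRECT);	/* reads get IOCB_DIRECT */
	if (fd < 0) { perror("open"); return 1; }

	/* O_DIRECT requires an aligned buffer */
	if (posix_memalign(&buf, 4096, 4096)) { close(fd); return 1; }

	ssize_t n = read(fd, buf, 4096);	/* goes to disk, not the page cache */
	if (n < 0) perror("read");
	else printf("read %zd bytes directly\n", n);

	free(buf);
	close(fd);
	return 0;
}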

Now let's look at generic_file_buffered_read():

1949 /**
1950  * generic_file_buffered_read - generic file read routine
1951  * @iocb:	the iocb to read
1952  * @iter:	data destination
1953  * @written:	already copied
1954  *
1955  * This is a generic file read routine, and uses the
1956  * mapping->a_ops->readpage() function for the actual low-level stuff.
1957  *
1958  * This is really ugly. But the goto's actually try to clarify some
1959  * of the logic when it comes to error handling etc.
1960  */
1961 static ssize_t generic_file_buffered_read(struct kiocb *iocb,
1962 		struct iov_iter *iter, ssize_t written)
1963 {

As the header comment already notes, the low-level work here is done through mapping->a_ops->readpage().

1980 	index = *ppos >> PAGE_SHIFT;
1981 	prev_index = ra->prev_pos >> PAGE_SHIFT;
1982 	prev_offset = ra->prev_pos & (PAGE_SIZE-1);
1983 	last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
1984 	offset = *ppos & ~PAGE_MASK;

So the read covers the page range [index, last_index).
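
A quick worked example, assuming 4 KB pages (PAGE_SHIFT = 12): a read of count = 10000 bytes at *ppos = 5000 gives index = 5000 >> 12 = 1, offset = 5000 & 4095 = 904, and last_index = (5000 + 10000 + 4095) >> 12 = 19095 >> 12 = 4, so pages 1, 2 and 3 are touched.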

1999 		page = find_get_page(mapping, index);

find_get_page() looks up the page at offset index in the page cache; with fgp_flags and gfp both 0 it is a pure lookup that never allocates:

263 /**
264  * find_get_page - find and get a page reference
265  * @mapping: the address_space to search
266  * @offset: the page index
267  *
268  * Looks up the page cache slot at @mapping & @offset.  If there is a
269  * page cache page, it is returned with an increased refcount.
270  *
271  * Otherwise, %NULL is returned.
272  */
273 static inline struct page *find_get_page(struct address_space *mapping,
274 					pgoff_t offset)
275 {
276 	return pagecache_get_page(mapping, offset, 0, 0);
277 }

2000 		if (!page) {
2001 			if (iocb->ki_flags & IOCB_NOWAIT)
2002 				goto would_block;
2003 			page_cache_sync_readahead(mapping,
2004 					ra, filp,
2005 					index, last_index - index);
2006 			page = find_get_page(mapping, index);
2007 			if (unlikely(page == NULL))
2008 				goto no_cached_page;
2009 		}

On a miss, page_cache_sync_readahead(…, offset, req_size) performs synchronous readahead and the lookup is retried; if the page is still absent (unlikely), we fall through to no_cached_page. Let's look at the synchronous readahead:

489 /**
490  * page_cache_sync_readahead - generic file readahead
491  * @mapping: address_space which holds the pagecache and I/O vectors
492  * @ra: file_ra_state which holds the readahead state
493  * @filp: passed on to ->readpage() and ->readpages()
494  * @offset: start offset into @mapping, in pagecache page-sized units
495  * @req_size: hint: total size of the read which the caller is performing in
496  *            pagecache pages
497  *
498  * page_cache_sync_readahead() should be called when a cache miss happened:
499  * it will submit the read.  The readahead logic may decide to piggyback more
500  * pages onto the read request if access patterns suggest it will improve
501  * performance.
502  */
503 void page_cache_sync_readahead(struct address_space *mapping,
504 			       struct file_ra_state *ra, struct file *filp,
505 			       pgoff_t offset, unsigned long req_size)
506 {
507 	/* no read-ahead */
508 	if (!ra->ra_pages)
509 		return;
510
511 	/* be dumb */
512 	if (filp && (filp->f_mode & FMODE_RANDOM)) {
513 		force_page_cache_readahead(mapping, filp, offset, req_size);
514 		return;
515 	}
516
517 	/* do read-ahead */
518 	ondemand_readahead(mapping, ra, filp, false, offset, req_size);
519 }

Synchronous readahead is triggered on a cache miss: a file marked FMODE_RANDOM (random access) takes the dumb force_page_cache_readahead() path, otherwise it goes through ondemand_readahead().
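
FMODE_RANDOM can be set from userspace with posix_fadvise(2) and POSIX_FADV_RANDOM (mm/fadvise.c sets file->f_mode |= FMODE_RANDOM). A sketch, with a placeholder file name:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("data.bin", O_RDONLY);	/* placeholder */
	if (fd < 0) { perror("open"); return 1; }

	/* Hint random access: sets FMODE_RANDOM, so cache misses now take
	 * the force_page_cache_readahead() path shown below. */
	if (posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM))
		fprintf(stderr, "posix_fadvise failed\n");

	/* ... random-offset pread() calls ... */
	close(fd);
	return 0;
}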

First, force_page_cache_readahead():

205 /*
206  * Chunk the readahead into 2 megabyte units, so that we don't pin too much
207  * memory at once.
208  */
209 int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
210 		pgoff_t offset, unsigned long nr_to_read)
211 {
212 	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
213 	struct file_ra_state *ra = &filp->f_ra;
214 	unsigned long max_pages;
215
216 	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
217 		return -EINVAL;
218
219 	/*
220 	 * If the request exceeds the readahead window, allow the read to
221 	 * be up to the optimal hardware IO size
222 	 */
223 	max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
224 	nr_to_read = min(nr_to_read, max_pages);
225 	while (nr_to_read) {
226 		int err;
227
228 		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
229
230 		if (this_chunk > nr_to_read)
231 			this_chunk = nr_to_read;
232 		err = __do_page_cache_readahead(mapping, filp,
233 						offset, this_chunk, 0);
234 		if (err < 0)
235 			return err;
236
237 		offset += this_chunk;
238 		nr_to_read -= this_chunk;
239 	}
240 	return 0;
241 }
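
A quick sanity check on the chunk size, assuming 4 KB pages: this_chunk = (2 * 1024 * 1024) / 4096 = 512, so each loop iteration submits at most 512 pages of readahead before advancing offset.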

So it just chops the readahead into chunks of at most 2 MB so that not too much memory is pinned at once; the logic is deliberately simple. Next, let's look at ondemand_readahead():

372 /*
373  * A minimal readahead algorithm for trivial sequential/random reads.
374  */
375 static unsigned long
376 ondemand_readahead(struct address_space *mapping,
377 		   struct file_ra_state *ra, struct file *filp,
378 		   bool hit_readahead_marker, pgoff_t offset,
379 		   unsigned long req_size)
380 {

I'll skip the algorithm itself; it revolves around the readahead window (ra->start, ra->size, ra->async_size).
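
For reference, the window state lives in struct file_ra_state, defined in include/linux/fs.h (quoted from 4.14, line numbers omitted):

struct file_ra_state {
	pgoff_t start;			/* where readahead started */
	unsigned int size;		/* # of readahead pages */
	unsigned int async_size;	/* do asynchronous readahead when
					   there are only # of pages ahead */

	unsigned int ra_pages;		/* Maximum readahead window */
	unsigned int mmap_miss;		/* Cache miss stat for mmap accesses */
	loff_t prev_pos;		/* Cache last read() position */
};

In the end, ondemand_readahead() also calls __do_page_cache_readahead():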

142 /*
143  * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates all
144  * the pages first, then submits them all for I/O. This avoids the very bad
145  * behaviour which would occur if page allocations are causing VM writeback.
146  * We really don't want to intermingle reads and writes like that.
147  *
148  * Returns the number of pages requested, or the maximum amount of I/O allowed.
149  */
150 int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
151 		pgoff_t offset, unsigned long nr_to_read,
152 		unsigned long lookahead_size)

It allocates all the pages first, then starts the I/O:

193 	/*
194 	 * Now start the IO.  We ignore I/O errors - if the page is not
195 	 * uptodate then the caller will launch readpage again, and
196 	 * will then handle the error.
197 	 */
198 	if (ret)
199 		read_pages(mapping, filp, &page_pool, ret, gfp_mask);

read_pages() then hands the freshly allocated pages to the filesystem:

111 static int read_pages(struct address_space *mapping, struct file *filp,
...
120 	if (mapping->a_ops->readpages) {
121 		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
122 		/* Clean up the remaining pages */
123 		put_pages_list(pages);
124 		goto out;
125 	}
126
127 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
128 		struct page *page = lru_to_page(pages);
129 		list_del(&page->lru);
130 		if (!add_to_page_cache_lru(page, mapping, page->index, gfp))
131 			mapping->a_ops->readpage(filp, page);
132 		put_page(page);
133 	}

->readpages() is preferred; if the filesystem doesn't provide it, the pages are submitted one by one through ->readpage().
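
For example, ext4 supplies both callbacks; its address_space_operations in fs/ext4/inode.c (4.14) look roughly like this (excerpted):

static const struct address_space_operations ext4_aops = {
	.readpage	= ext4_readpage,
	.readpages	= ext4_readpages,
...
};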

Continuing with generic_file_buffered_read():

2010 		if (PageReadahead(page)) {
2011 			page_cache_async_readahead(mapping,
2012 					ra, filp, page,
2013 					index, last_index - index);
2014 		}
2015 		if (!PageUptodate(page)) {
2016 			if (iocb->ki_flags & IOCB_NOWAIT) {
2017 				put_page(page);
2018 				goto would_block;
2019 			}
2020
2021 			/*
2022 			 * See comment in do_read_cache_page on why
2023 			 * wait_on_page_locked is used to avoid unnecessarily
2024 			 * serialisations and why it's safe.
2025 			 */
2026 			error = wait_on_page_locked_killable(page);

If the page was found in the page cache but carries the PG_readahead marker, an asynchronous readahead is kicked off to keep the window ahead of the reader.

We then have to check whether the page is uptodate (line 2015). Why? The comment points us at the use of wait_on_page_locked in do_read_cache_page(), so let's take a look:

2689 static struct page *do_read_cache_page(struct address_space *mapping,
2690 				pgoff_t index,
2691 				int (*filler)(void *, struct page *),
2692 				void *data,
2693 				gfp_t gfp)
2694 {
2695 	struct page *page;
2696 	int err;
2697 repeat:
2698 	page = find_get_page(mapping, index);
2699 	if (!page) {
2700 		page = __page_cache_alloc(gfp | __GFP_COLD);
2701 		if (!page)
2702 			return ERR_PTR(-ENOMEM);
2703 		err = add_to_page_cache_lru(page, mapping, index, gfp);
2704 		if (unlikely(err)) {
2705 			put_page(page);
2706 			if (err == -EEXIST)
2707 				goto repeat;
2708 			/* Presumably ENOMEM for radix tree node */
2709 			return ERR_PTR(err);
2710 		}
2711
2712 filler:
2713 		err = filler(data, page);
2714 		if (err < 0) {
2715 			put_page(page);
2716 			return ERR_PTR(err);
2717 		}
2718
2719 		page = wait_on_page_read(page);
2720 		if (IS_ERR(page))
2721 			return page;
2722 		goto out;
2723 	}
2724 	if (PageUptodate(page))
2725 		goto out;

Even after find_get_page() finds the page, it may still be locked:

2727 	/*
2728 	 * Page is not up to date and may be locked due one of the following
2729 	 * case a: Page is being filled and the page lock is held
2730 	 * case b: Read/write error clearing the page uptodate status
2731 	 * case c: Truncation in progress (page locked)
2732 	 * case d: Reclaim in progress
2733 	 *
...
2757 	 */
2758 	wait_on_page_locked(page);
2759 	if (PageUptodate(page))
2760 		goto out;
2761
2762 	/* Distinguish between all the cases under the safety of the lock */
2763 	lock_page(page);
2764
2765 	/* Case c or d, restart the operation */
2766 	if (!page->mapping) {
2767 		unlock_page(page);
2768 		put_page(page);
2769 		goto repeat;
2770 	}
2771
2772 	/* Someone else locked and filled the page in a very small window */
2773 	if (PageUptodate(page)) {
2774 		unlock_page(page);
2775 		goto out;
2776 	}
2777 	goto filler;
2778
2779 out:
2780 	mark_page_accessed(page);
2781 	return page;
2782 }

As you can see, a cached page can be in quite a few intermediate states: being filled (case a), left !uptodate by an I/O error (case b), being truncated (case c) or reclaimed (case d). Cases c and d are recognized by page->mapping being NULL and restart the lookup; case b retries the read through the filler: label.

Waiting for the page to be unlocked:

516 /*
517  * Wait for a page to be unlocked.
518  *
519  * This must be called with the caller "holding" the page,
520  * ie with increased "page->count" so that the page won't
521  * go away during the wait..
522  */
523 static inline void wait_on_page_locked(struct page *page)
524 {
525 	if (PageLocked(page))
526 		wait_on_page_bit(compound_head(page), PG_locked);
527 }

OK, so back in generic_file_buffered_read(): once the wait ends and the page is uptodate, we reach the page_ok label, copy the data to user space, and move on to the next page:

2091 		/*
2092 		 * Ok, we have the page, and it's up-to-date, so
2093 		 * now we can copy it to user space...
2094 		 */
2095
2096 		ret = copy_page_to_iter(page, offset, nr, iter);
2097 		offset += ret;
2098 		index += offset >> PAGE_SHIFT;
2099 		offset &= ~PAGE_MASK;
2100 		prev_offset = offset;
2101
2102 		put_page(page);
2103 		written += ret;
2104 		if (!iov_iter_count(iter))
2105 			goto out;
2106 		if (ret < nr) {
2107 			error = -EFAULT;
2108 			goto out;
2109 		}
2110 		continue;
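
Continuing the earlier worked example: the first iteration copies nr = 4096 - 904 = 3192 bytes out of page 1, after which offset becomes 4096, index advances to 2 (index += offset >> PAGE_SHIFT) and offset wraps to 0 (offset &= ~PAGE_MASK); the next iteration copies all of page 2, and the final one copies the remaining 2712 bytes from page 3.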

In one sentence: whatever can't be found in the page cache is filled in by the filesystem's low-level ->readpage()/->readpages(), whose only job is to read from disk.
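
To recap the whole buffered read path:

read(2)
  -> vfs_read() -> __vfs_read()
     -> new_sync_read() -> call_read_iter() -> f_op->read_iter()
        e.g. generic_file_read_iter()
           -> generic_file_buffered_read()
              hit:          copy_page_to_iter()
              miss:         page_cache_sync_readahead()
              PG_readahead: page_cache_async_readahead()
                 -> ondemand_readahead() / force_page_cache_readahead()
                    -> __do_page_cache_readahead() -> read_pages()
                       -> a_ops->readpages() / a_ops->readpage()

As a final hands-on check, mincore(2) can show the page cache being populated by reads and readahead. A userspace sketch (not from the kernel sources):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

/* Print how many pages of a file are resident in the page cache. */
int main(int argc, char **argv)
{
	struct stat st;
	if (argc < 2) return 1;

	int fd = open(argv[1], O_RDONLY);
	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0) return 1;

	long psize = sysconf(_SC_PAGESIZE);
	size_t pages = (st.st_size + psize - 1) / psize;

	void *addr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED) { perror("mmap"); return 1; }

	unsigned char *vec = malloc(pages);
	if (vec && mincore(addr, st.st_size, vec) == 0) {
		size_t resident = 0;
		for (size_t i = 0; i < pages; i++)
			resident += vec[i] & 1;	/* LSB = resident */
		printf("%zu/%zu pages in page cache\n", resident, pages);
	}

	free(vec);
	munmap(addr, st.st_size);
	close(fd);
	return 0;
}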

Done.