Linux 文件系统写-ext2流程

整体写流程如下

write函数主要分为三步：获取page cache，拷贝数据到page cache，设置inode为脏并加入b_dirty链表；
每次写完都要进行bdi dirty 平衡判断，如果需要回写，则唤醒回写线程进行真正的回写；

函数注释如下

/*
 * write(2) system call entry.
 *
 * Looks up the file behind @fd, reads the current file position, hands the
 * request to vfs_write() and stores the position back when vfs_write() did
 * not return a negative value.  Returns whatever vfs_write() returned, or
 * -EBADF when the descriptor has no file attached.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	struct fd f = fdget_pos(fd);
	ssize_t nwritten;
	loff_t offset;

	/* Nothing to release in this case: the lookup yielded no file. */
	if (!f.file)
		return -EBADF;

	offset = file_pos_read(f.file);
	nwritten = vfs_write(f.file, buf, count, &offset);

	/* Only publish the new position when the write did not fail. */
	if (nwritten >= 0)
		file_pos_write(f.file, offset);

	fdput_pos(f);
	return nwritten;
}

/*
 * vfs_write - filesystem-independent part of a write.
 * @file:  file being written
 * @buf:   user-space source buffer
 * @count: number of bytes requested
 * @pos:   in/out file position
 *
 * Rejects the request early with -EBADF (file not opened for writing),
 * -EINVAL (FMODE_CAN_WRITE not set) or -EFAULT (access_ok() failed on the
 * user buffer).  After rw_verify_area() succeeds, its return value replaces
 * @count and the write is dispatched to the first available method, in
 * this order: f_op->write, do_sync_write() when f_op->aio_write exists,
 * otherwise new_sync_write().
 *
 * Returns the value produced by the chosen method, or the negative value
 * from rw_verify_area().
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
		return -EFAULT;

	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret >= 0) {
		/* rw_verify_area() hands back the byte count to use from here on. */
		count = ret;
		file_start_write(file);
		if (file->f_op->write)
			ret = file->f_op->write(file, buf, count, pos);
		else if (file->f_op->aio_write)
			ret = do_sync_write(file, buf, count, pos);
		else
			ret = new_sync_write(file, buf, count, pos);
		if (ret > 0) {
			/* Something was written: notify watchers and account the bytes. */
			fsnotify_modify(file);
			add_wchar(current, ret);
		}
		/* Counted for every attempt that passed rw_verify_area(). */
		inc_syscw(current);
		file_end_write(file);
	}

	return ret;
}

/*
 * new_sync_write - synchronous write built on top of ->write_iter.
 *
 * Wraps the single user buffer in a one-segment iovec/iov_iter, sets up a
 * kiocb starting at *@ppos and calls filp->f_op->write_iter().  When that
 * returns -EIOCBQUEUED the result is taken from wait_on_sync_kiocb()
 * instead.  *@ppos is always updated from the kiocb before returning.
 */
ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec vec;
	struct iov_iter from;
	struct kiocb req;
	ssize_t nwritten;

	/* Describe the caller's buffer as a single-segment iterator. */
	vec.iov_base = (void __user *)buf;
	vec.iov_len = len;
	iov_iter_init(&from, WRITE, &vec, 1, len);

	/* Request descriptor: target file, start offset and length. */
	init_sync_kiocb(&req, filp);
	req.ki_pos = *ppos;
	req.ki_nbytes = len;

	nwritten = filp->f_op->write_iter(&req, &from);
	if (nwritten == -EIOCBQUEUED)
		nwritten = wait_on_sync_kiocb(&req);

	*ppos = req.ki_pos;
	return nwritten;
}

/*
 * generic_file_write_iter - locked wrapper around __generic_file_write_iter().
 *
 * Takes inode->i_mutex around the actual write.  If at least one byte was
 * written, generic_write_sync() is called for the range that was just
 * written; a negative result from it replaces the byte count.
 */
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct inode *host = filp->f_mapping->host;
	ssize_t nwritten;
	ssize_t sync_err;

	mutex_lock(&host->i_mutex);
	nwritten = __generic_file_write_iter(iocb, from);
	mutex_unlock(&host->i_mutex);

	/* Error or nothing written: no range to sync. */
	if (nwritten <= 0)
		return nwritten;

	/* ki_pos already points past the data, so step back to its start. */
	sync_err = generic_write_sync(filp, iocb->ki_pos - nwritten, nwritten);
	return sync_err < 0 ? sync_err : nwritten;
}

/*
 * __generic_file_write_iter - write data to a file, without taking i_mutex.
 * @iocb: request descriptor (file and starting position)
 * @from: iterator over the data to write
 *
 * generic_file_write_iter() in this file calls this with inode->i_mutex
 * held.  After the generic checks it either performs a direct write
 * (falling back to a buffered write for whatever the direct write did not
 * cover) or a plain buffered write through generic_perform_write().
 *
 * Returns the number of bytes written if that is non-zero, otherwise the
 * last error code recorded in err.
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space * mapping = file->f_mapping;
	struct inode 	*inode = mapping->host;
	loff_t		pos = iocb->ki_pos;
	ssize_t		written = 0;
	ssize_t		err;
	ssize_t		status;
	size_t		count = iov_iter_count(from);

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);
	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err)
		goto out;

	if (count == 0)
		goto out;

	iov_iter_truncate(from, count);

	err = file_remove_suid(file);
	if (err)
		goto out;

	/* Update the file timestamps. */
	err = file_update_time(file);
	if (err)
		goto out;

	/* Direct I/O path. */
	if (io_is_direct(file)) {
		loff_t endbyte;

		written = generic_file_direct_write(iocb, from, pos);
		/*
		 * If the write stopped short of completing, fall back to
		 * buffered writes.  Some filesystems do this for writes to
		 * holes, for example.  For DAX files, a buffered write will
		 * not succeed (even if it did, DAX does not handle dirty
		 * page-cache pages correctly).
		 */
		if (written < 0 || written == count || IS_DAX(inode))
			goto out;

		pos += written;
		count -= written;

		status = generic_perform_write(file, from, pos);
		/*
		 * If generic_perform_write() returned a synchronous error
		 * then we want to return the number of bytes which were
		 * direct-written, or the error code if that was zero.  Note
		 * that this differs from normal direct-io semantics, which
		 * will return -EFOO even if some bytes were written.
		 */
		if (unlikely(status < 0)) {
			err = status;
			goto out;
		}
		iocb->ki_pos = pos + status;
		/*
		 * We need to ensure that the page cache pages are written to
		 * disk and invalidated to preserve the expected O_DIRECT
		 * semantics.
		 */
		endbyte = pos + status - 1;
		err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
		if (err == 0) {
			written += status;
			invalidate_mapping_pages(mapping,
						 pos >> PAGE_CACHE_SHIFT,
						 endbyte >> PAGE_CACHE_SHIFT);
		} else {
			/*
			 * We don't know how much we wrote, so just return
			 * the number of bytes which were direct-written
			 */
		}
	} else {
		/* Buffered path: copy into the page cache. */
		written = generic_perform_write(file, from, pos);
		if (likely(written >= 0))
			iocb->ki_pos = pos + written;
	}
out:
	current->backing_dev_info = NULL;
	return written ? written : err;
}

/*
 * generic_perform_write - copy data from an iterator into the page cache.
 * @file: file being written
 * @i:    source iterator
 * @pos:  starting file offset
 *
 * Works one page-cache page at a time: write_begin, copy from the
 * iterator, write_end, then balance_dirty_pages_ratelimited() so the
 * writer takes part in dirty-page throttling.  The loop ends when the
 * iterator is exhausted, when an address_space operation fails, when the
 * source cannot be faulted in, or when a fatal signal is pending.
 *
 * Returns the number of bytes written if non-zero, otherwise the last
 * status value.
 */
ssize_t generic_perform_write(struct file *file,
				struct iov_iter *i, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = 0;

	/*
	 * Copies from kernel address space cannot fail (NFSD is a big user).
	 */
	if (!iter_is_iovec(i))
		flags |= AOP_FLAG_UNINTERRUPTIBLE;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
		void *fsdata;

		offset = (pos & (PAGE_CACHE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
						iov_iter_count(i));

again:
		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		/*
		 * Step 1: prepare the write.  pos and bytes go in; the target
		 * page and fsdata come back through the out-pointers.
		 */
		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
						&page, &fsdata);
		if (unlikely(status < 0))
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/* Step 2: copy the data into the page-cache page. */
		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		flush_dcache_page(page);

		/* Step 3: finish the write; its return value becomes copied. */
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;

		/*
		 * Step 4: check whether writeback is needed (dirty-page
		 * balancing) -- presumably this ends up in
		 * balance_dirty_pages(); confirm against the helper.
		 */
		balance_dirty_pages_ratelimited(mapping);
		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}
	} while (iov_iter_count(i));

	return written ? written : status;
}


/** balance_dirty_pages() must be called by processes which are generating dirty* data.  It looks at the number of dirty pages in the machine and will force* the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.* If we're over `background_thresh' then the writeback threads are woken to* perform some writeout.*/
static void balance_dirty_pages(struct address_space *mapping,unsigned long pages_dirtied)
{unsigned long nr_reclaimable;  /* = file_dirty + unstable_nfs */unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */unsigned long background_thresh;unsigned long dirty_thresh;long period;long pause;long max_pause;long min_pause;int nr_dirtied_pause;bool dirty_exceeded = false;unsigned long task_ratelimit;unsigned long dirty_ratelimit;unsigned long pos_ratio;struct backing_dev_info *bdi = inode_to_bdi(mapping->host);bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; //单独门限值回收unsigned long start_time = jiffies;for (;;) {unsigned long now = jiffies;unsigned long uninitialized_var(bdi_thresh);unsigned long thresh;unsigned long uninitialized_var(bdi_dirty);unsigned long dirty;unsigned long bg_thresh;/** Unstable writes are a feature of certain networked* filesystems (i.e. NFS) in which data may have been* written to the server's write cache, but has not yet* been flushed to permanent storage.*/nr_reclaimable = global_page_state(NR_FILE_DIRTY) +global_page_state(NR_UNSTABLE_NFS);  /* 全局 文件脏页  + 网络文件系统 */nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); /*全局 文件总的脏页+包括正在回写 */global_dirty_limits(&background_thresh, &dirty_thresh);//获取两个门限值if (unlikely(strictlimit)) {  /* 单独bdi回收 */bdi_dirty_limits(bdi, dirty_thresh, background_thresh,&bdi_dirty, &bdi_thresh, &bg_thresh);dirty = bdi_dirty;thresh = bdi_thresh;} else {                       /* 全局回收 */dirty = nr_dirty;          /* 全局 文件总的脏页+包括正在回写 */thresh = dirty_thresh;bg_thresh = background_thresh;}/** Throttle it only when the background writeback cannot* catch-up. This avoids (excessively) small writeouts* when the bdi limits are ramping up in case of !strictlimit.** In strictlimit case make decision based on the bdi counters* and limits. 
Small writeouts when the bdi limits are ramping* up are the price we consciously pay for strictlimit-ing.*/if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {  //(thresh + bg_thresh) / 2; 不回收current->dirty_paused_when = now;current->nr_dirtied = 0;   /* 脏页数量重新置0 */current->nr_dirtied_pause =dirty_poll_interval(dirty, thresh);break;}if (unlikely(!writeback_in_progress(bdi)))  /* 唤醒真正的回写线程 */bdi_start_background_writeback(bdi);if (!strictlimit)bdi_dirty_limits(bdi, dirty_thresh, background_thresh,&bdi_dirty, &bdi_thresh, NULL);//nr_dirty > dirty_threshdirty_exceeded = (bdi_dirty > bdi_thresh) &&((nr_dirty > dirty_thresh) || strictlimit); //超过门限if (dirty_exceeded && !bdi->dirty_exceeded)bdi->dirty_exceeded = 1;                        //超过门限，后面需要加速回收bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,nr_dirty, bdi_thresh, bdi_dirty,start_time);dirty_ratelimit = bdi->dirty_ratelimit;pos_ratio = bdi_position_ratio(bdi, dirty_thresh,background_thresh, nr_dirty,bdi_thresh, bdi_dirty);task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>RATELIMIT_CALC_SHIFT;max_pause = bdi_max_pause(bdi, bdi_dirty);min_pause = bdi_min_pause(bdi, max_pause,task_ratelimit, dirty_ratelimit,&nr_dirtied_pause);if (unlikely(task_ratelimit == 0)) {period = max_pause;pause = max_pause;goto pause;}period = HZ * pages_dirtied / task_ratelimit;pause = period;if (current->dirty_paused_when)pause -= now - current->dirty_paused_when;/** For less than 1s think time (ext3/4 may block the dirtier* for up to 800ms from time to time on 1-HDD; so does xfs,* however at much less frequency), try to compensate it in* future periods by updating the virtual time; otherwise just* do a reset, as it may be a light dirtier.*/if (pause < min_pause) {trace_balance_dirty_pages(bdi,dirty_thresh,background_thresh,nr_dirty,bdi_thresh,bdi_dirty,dirty_ratelimit,task_ratelimit,pages_dirtied,period,min(pause, 0L),start_time);if (pause < -HZ) {current->dirty_paused_when = now;current->nr_dirtied = 0;} else if (period) 
{current->dirty_paused_when += period;current->nr_dirtied = 0;} else if (current->nr_dirtied_pause <= pages_dirtied)current->nr_dirtied_pause += pages_dirtied;break;}if (unlikely(pause > max_pause)) {/* for occasional dropped task_ratelimit */now += min(pause - max_pause, max_pause);pause = max_pause;}pause:trace_balance_dirty_pages(bdi,dirty_thresh,background_thresh,nr_dirty,bdi_thresh,bdi_dirty,dirty_ratelimit,task_ratelimit,pages_dirtied,period,pause,start_time);__set_current_state(TASK_KILLABLE);io_schedule_timeout(pause);//有可能会切出去，但最大超过200mscurrent->dirty_paused_when = now + pause;current->nr_dirtied = 0;current->nr_dirtied_pause = nr_dirtied_pause;/** This is typically equal to (nr_dirty < dirty_thresh) and can* also keep "1000+ dd on a slow USB stick" under control.*/if (task_ratelimit)break;/** In the case of an unresponding NFS server and the NFS dirty* pages exceeds dirty_thresh, give the other good bdi's a pipe* to go through, so that tasks on them still remain responsive.** In theory 1 page is enough to keep the comsumer-producer* pipe going: the flusher cleans 1 page => the task dirties 1* more page. However bdi_dirty has accounting errors.  So use* the larger and more IO friendly bdi_stat_error.*/if (bdi_dirty <= bdi_stat_error(bdi))break;if (fatal_signal_pending(current))break;}if (!dirty_exceeded && bdi->dirty_exceeded)  //如果不超过门限，则置0bdi->dirty_exceeded = 0;if (writeback_in_progress(bdi))  //正在回收，则退出return;/** In laptop mode, we wait until hitting the higher threshold before* starting background writeout, and then write out all the way down* to the lower threshold.  So slow writers cause minimal disk activity.** In normal mode, we start background writeout at the lower* background_thresh, to keep the amount of dirty memory low.*/if (laptop_mode)return;if (nr_reclaimable > background_thresh) //可回收的页面大于background_thresh，则触发线程异步回收bdi_start_background_writeback(bdi);
}

void bdi_start_background_writeback(struct backing_dev_info *bdi)
{/** We just wake up the flusher thread. It will perform background* writeback as soon as there is no other work to do.*/trace_writeback_wake_background(bdi);bdi_wakeup_thread(bdi);
}

/* 这里并没有wb_writeback_work加入到bdi->work_list 也就是唤醒的线程没有work要处理 */

/*
 * bdi_wakeup_thread - schedule the bdi's writeback work to run immediately.
 *
 * Under bdi->wb_lock, re-arms bdi->wb.dwork on bdi_wq with a zero delay,
 * but only while the BDI_registered bit is set.  Note that no
 * wb_writeback_work is added to bdi->work_list here, so the woken worker
 * has no explicit work item to process.
 */
static void bdi_wakeup_thread(struct backing_dev_info *bdi)
{
	spin_lock_bh(&bdi->wb_lock);

	/* An unregistered bdi must not have its work re-armed. */
	if (!test_bit(BDI_registered, &bdi->state))
		goto unlock;

	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);

unlock:
	spin_unlock_bh(&bdi->wb_lock);
}

回收处理

主要有三种回收：work回收、周期回收过期数据、超过门限回收；

static long wb_do_writeback(struct bdi_writeback *wb)
{struct backing_dev_info *bdi = wb->bdi;struct wb_writeback_work *work;long wrote = 0;set_bit(BDI_writeback_running, &wb->bdi->state);while ((work = get_next_work_item(bdi)) != NULL) {  //1：work触发回收trace_writeback_exec(bdi, work);wrote += wb_writeback(wb, work);/** Notify the caller of completion if this is a synchronous* work item, otherwise just free it.*/if (work->done)complete(work->done);elsekfree(work);}/** Check for periodic writeback, kupdated() style*/wrote += wb_check_old_data_flush(wb);    //2：过期一定时间的page回收（默认30S）wrote += wb_check_background_flush(wb);  //3：门限回收clear_bit(BDI_writeback_running, &wb->bdi->state);return wrote;
}

先来看wb_check_old_data_flush和wb_check_background_flush的wb_writeback_work设置：
这两个属于系统设置的回收：
wb_check_old_data_flush：周期性回写过期的脏页，一般每5S判断一次（dirty_writeback_interval），脏了超过30S的数据才会被回写（dirty_expire_interval）；
wb_check_background_flush：超过门限进行回收；设置参数为（dirty_background_bytes、dirty_background_ratio）

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{unsigned long expired;long nr_pages;/** When set to zero, disable periodic writeback*/if (!dirty_writeback_interval)return 0;expired = wb->last_old_flush +msecs_to_jiffies(dirty_writeback_interval * 10);if (time_before(jiffies, expired))return 0;/* 更新本次运行的起始时间 */wb->last_old_flush = jiffies;  nr_pages = get_nr_dirty_pages();if (nr_pages) {struct wb_writeback_work work = {.nr_pages = nr_pages,.sync_mode  = WB_SYNC_NONE,.for_kupdate    = 1,.range_cyclic  = 1,.reason        = WB_REASON_PERIODIC,};return wb_writeback(wb, &work);}return 0;
}static long wb_check_background_flush(struct bdi_writeback *wb)
{if (over_bground_thresh(wb->bdi)) {struct wb_writeback_work work = {.nr_pages  = LONG_MAX,  /* 页数设置为无穷， */.sync_mode   = WB_SYNC_NONE,.for_background = 1, .range_cyclic = 1,.reason        = WB_REASON_BACKGROUND,};return wb_writeback(wb, &work);}return 0;
}

static long wb_writeback(struct bdi_writeback *wb,struct wb_writeback_work *work)
{unsigned long wb_start = jiffies;long nr_pages = work->nr_pages;unsigned long oldest_jif;struct inode *inode;long progress;oldest_jif = jiffies;work->older_than_this = &oldest_jif; //指针赋值spin_lock(&wb->list_lock);/* * 结束的条件为回写了nr_pages（达到了回收的目标），或者回收完了* wb_check_background_flush进入时nr_pages设置为无穷，退出循环为达到门限以下； */for (;;) {/** Stop writeback when nr_pages has been consumed*/if (work->nr_pages <= 0)break;/** Background writeout and kupdate-style writeback may* run forever. Stop them if there is other work to do* so that e.g. sync can proceed. They'll be restarted* after the other works are all done.*//* 优先处理work回收，门限回收和周期回收优先级低 */if ((work->for_background || work->for_kupdate) &&!list_empty(&wb->bdi->work_list))break;/** For background writeout, stop when we are below the* background dirty threshold*//* 背景回收先判断是否超过门限，如果在门限以下则退出 */if (work->for_background && !over_bground_thresh(wb->bdi)) break;/** Kupdate and background works are special and we want to* include all inodes that need writing. Livelock avoidance is* handled by these works yielding to any other work so we are* safe.*/if (work->for_kupdate) {oldest_jif = jiffies -msecs_to_jiffies(dirty_expire_interval * 10); /* 过期的数据需要回写，默认30S之前。在queue_io中使用 */} else if (work->for_background)oldest_jif = jiffies;                             /*  */trace_writeback_start(wb->bdi, work);if (list_empty(&wb->b_io))          /* 如果bio队列为空，需要从more_io和dirty_io中拿一些过来 */queue_io(wb, work);if (work->sb)progress = writeback_sb_inodes(work->sb, wb, work);  /* 一般为这个分支 */elseprogress = __writeback_inodes_wb(wb, work);trace_writeback_written(wb->bdi, work);wb_update_bandwidth(wb, wb_start);/** Did we write something? Try for more** Dirty inodes are moved to b_io for writeback in batches.* The completion of the current batch does not necessarily* mean the overall work is done. 
So we keep looping as long* as made some progress on cleaning pages or inodes.*//* 如果这次循环触发了写inode,则继续循环判断 */if (progress)continue;/** No more inodes for IO, bail*//* 本次没写且more_io页为空，说明没有可写的了，退出 */if (list_empty(&wb->b_more_io))break;/** Nothing written. Wait for some inode to* become available for writeback. Otherwise* we'll just busyloop.*//* 如果每次没有写且more_io不为空，说明别的地方调用了回写，此处等待回写完成 */if (!list_empty(&wb->b_more_io))  {trace_writeback_wait(wb->bdi, work);inode = wb_inode(wb->b_more_io.prev);spin_lock(&inode->i_lock);spin_unlock(&wb->list_lock);/* This function drops i_lock... */inode_sleep_on_writeback(inode);spin_lock(&wb->list_lock);}}spin_unlock(&wb->list_lock);return nr_pages - work->nr_pages;
}


/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * Return the number of pages and/or inodes written.
 *
 * Entered with wb->list_lock held (the loop drops it with spin_unlock()
 * before writing an inode and retakes it afterwards).
 */
static long writeback_sb_inodes(struct super_block *sb,
				struct bdi_writeback *wb,
				struct wb_writeback_work *work)
{
	struct writeback_control wbc = {
		.sync_mode		= work->sync_mode,
		.tagged_writepages	= work->tagged_writepages,
		.for_kupdate		= work->for_kupdate,
		.for_background		= work->for_background,
		.for_sync		= work->for_sync,
		.range_cyclic		= work->range_cyclic,
		.range_start		= 0,
		.range_end		= LLONG_MAX,	/* whole file range */
	};
	unsigned long start_time = jiffies;
	long write_chunk;
	long wrote = 0;  /* count both pages and inodes */

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);

		/*
		 * Inode on a different superblock than @sb:
		 *  1. without work->sb, return so the caller can come back
		 *     with the next superblock;
		 *  2. with work->sb, only that superblock is written; other
		 *     inodes are put back with redirty_tail().
		 */
		if (inode->i_sb != sb) {
			if (work->sb) {
				/*
				 * We only want to write back data for this
				 * superblock, move all inodes not belonging
				 * to it back onto the dirty list.
				 */
				redirty_tail(inode, wb);
				continue;
			}

			/*
			 * The inode belongs to a different superblock.
			 * Bounce back to the caller to unpin this and
			 * pin the next superblock.
			 */
			break;
		}

		/*
		 * Don't bother with new inodes or inodes being freed, first
		 * kind does not need periodic writeout yet, and for the latter
		 * kind writeout is handled by the freer.
		 */
		spin_lock(&inode->i_lock);
		/* New or about-to-be-freed inode: requeue it for a later pass. */
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			redirty_tail(inode, wb);
			continue;
		}
		/*
		 * Inode already under writeback (I_SYNC) and this is not a
		 * data-integrity pass (sync_mode != WB_SYNC_ALL): park it on
		 * b_more_io and move on.
		 */
		if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
			/*
			 * If this inode is locked for writeback and we are not
			 * doing writeback-for-data-integrity, move it to
			 * b_more_io so that writeback can proceed with the
			 * other inodes on s_io.
			 *
			 * We'll have another go at writing back this inode
			 * when we completed a full scan of b_io.
			 */
			spin_unlock(&inode->i_lock);
			requeue_io(inode, wb);	/* inode goes onto b_more_io */
			trace_writeback_sb_inodes_requeue(inode);
			continue;
		}
		spin_unlock(&wb->list_lock);

		/*
		 * We already requeued the inode if it had I_SYNC set and we
		 * are doing WB_SYNC_NONE writeback. So this catches only the
		 * WB_SYNC_ALL case.
		 */
		/* Data-integrity writeback: wait for the running writeback. */
		if (inode->i_state & I_SYNC) {
			/* Wait for I_SYNC. This function drops i_lock... */
			inode_sleep_on_writeback(inode);
			/* Inode may be gone, start again */
			spin_lock(&wb->list_lock);
			continue;
		}
		inode->i_state |= I_SYNC;	/* mark the inode as under writeback */
		spin_unlock(&inode->i_lock);

		write_chunk = writeback_chunk_size(wb->bdi, work);
		/* NOTE(review): the original note says this is usually 4096 -- confirm. */
		wbc.nr_to_write = write_chunk;
		wbc.pages_skipped = 0;

		/*
		 * We use I_SYNC to pin the inode in memory. While it is set
		 * evict_inode() will wait so the inode cannot be freed.
		 */
		__writeback_single_inode(inode, &wbc);	/* do the writeback */

		work->nr_pages -= write_chunk - wbc.nr_to_write;
		wrote += write_chunk - wbc.nr_to_write;
		spin_lock(&wb->list_lock);	/* list manipulation needs the lock */
		spin_lock(&inode->i_lock);
		if (!(inode->i_state & I_DIRTY_ALL))
			wrote++;
		requeue_inode(inode, wb, &wbc);	/* requeue according to inode state */
		/*
		 * Presumably clears I_SYNC and wakes sleepers in
		 * inode_sleep_on_writeback() -- helper not shown here.
		 */
		inode_sync_complete(inode);
		spin_unlock(&inode->i_lock);
		cond_resched_lock(&wb->list_lock);
		/*
		 * bail out to wb_writeback() often enough to check
		 * background threshold and other termination conditions.
		 */
		if (wrote) {
			if (time_is_before_jiffies(start_time + HZ / 10UL))
				break;
			if (work->nr_pages <= 0)
				break;
		}
	}
	return wrote;
}

static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{struct address_space *mapping = inode->i_mapping;long nr_to_write = wbc->nr_to_write;unsigned dirty;int ret;WARN_ON(!(inode->i_state & I_SYNC));trace_writeback_single_inode_start(inode, wbc, nr_to_write);ret = do_writepages(mapping, wbc);  /* 触发inode写 *//** Make sure to wait on the data before writing out the metadata.* This is important for filesystems that modify metadata on data* I/O completion. We don't do it for sync(2) writeback because it has a* separate, external IO completion path and ->sync_fs for guaranteeing* inode metadata is written back correctly.*/if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {int err = filemap_fdatawait(mapping);      /* 等待回写完成 */if (ret == 0)ret = err;}/** Some filesystems may redirty the inode during the writeback* due to delalloc, clear dirty metadata flags right before* write_inode()*/spin_lock(&inode->i_lock);dirty = inode->i_state & I_DIRTY;if (inode->i_state & I_DIRTY_TIME) {if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||unlikely(time_after(jiffies,(inode->dirtied_time_when +dirtytime_expire_interval * HZ)))) {dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;trace_writeback_lazytime(inode);}} elseinode->i_state &= ~I_DIRTY_TIME_EXPIRED;inode->i_state &= ~dirty;/** Paired with smp_mb() in __mark_inode_dirty().  This allows* __mark_inode_dirty() to test i_state without grabbing i_lock -* either they see the I_DIRTY bits cleared or we see the dirtied* inode.** I_DIRTY_PAGES is always cleared together above even if @mapping* still has dirty pages.  The flag is reinstated after smp_mb() if* necessary.  
This guarantees that either __mark_inode_dirty()* sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.*/smp_mb();if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))inode->i_state |= I_DIRTY_PAGES;spin_unlock(&inode->i_lock);if (dirty & I_DIRTY_TIME)mark_inode_dirty_sync(inode);/* Don't write the inode if only I_DIRTY_PAGES was set *//* 如果只有文件数据修改，则不需要回写inode,否则要回写inode */if (dirty & ~I_DIRTY_PAGES) {int err = write_inode(inode, wbc);  /* write_inode是一个同步函数 */if (ret == 0)ret = err;}trace_writeback_single_inode(inode, wbc, nr_to_write);return ret;
}

问题一：
dirty_writeback_interval、dirty_expire_interval两个参数怎么起作用？
dirty_writeback_interval作用在wb_check_old_data_flush，如果距离上次周期回写的时间未到，不触发回收；
dirty_expire_interval作用在wb_writeback中（for_kupdate时用它计算work->older_than_this），queue_io调用的move_expired_inodes据此判断，只有超时的inode才移动到b_io中进行回收


static long wb_check_old_data_flush(struct bdi_writeback *wb)
{unsigned long expired;long nr_pages;/** When set to zero, disable periodic writeback*/if (!dirty_writeback_interval)return 0;expired = wb->last_old_flush +msecs_to_jiffies(dirty_writeback_interval * 10);if (time_before(jiffies, expired))return 0;...return 0;
}在static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{int moved;assert_spin_locked(&wb->list_lock);list_splice_init(&wb->b_more_io, &wb->b_io);                   //more_io 无条件移动到 wb_iomoved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); //移动超时的节点b_dirty->b_iomoved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,EXPIRE_DIRTY_ATIME, work);trace_writeback_queue_io(wb, work, moved);
}/** Move expired (dirtied before work->older_than_this) dirty inodes from* @delaying_queue to @dispatch_queue.*/
static int move_expired_inodes(struct list_head *delaying_queue,struct list_head *dispatch_queue,int flags,struct wb_writeback_work *work)
{unsigned long *older_than_this = NULL;unsigned long expire_time;LIST_HEAD(tmp);struct list_head *pos, *node;struct super_block *sb = NULL;struct inode *inode;int do_sb_sort = 0;int moved = 0;/* * 对于wb_check_old_data_flush来说，older_than_this = work->older_than_this;* 对于wb_check_background_flush来说，for_sync=0，expire_time = jiffies - (dirtytime_expire_interval * HZ)* older_than_this 为空的话，则不比较时间，所有都移动。什么情况下？？？*/if ((flags & EXPIRE_DIRTY_ATIME) == 0)          /* 使用传入进来的参数 */older_than_this = work->older_than_this;else if (!work->for_sync) {                     /* 如果没有强制刷新标志，则设置时间为默认30S之前 */expire_time = jiffies - (dirtytime_expire_interval * HZ);older_than_this = &expire_time;          }                                               /* 其他的情况，不比较时间，都移动 */while (!list_empty(delaying_queue)) {inode = wb_inode(delaying_queue->prev);/* 如果当前节点没有过期的话，则后面的节点肯定没有过期，则直接退出即可 */if (older_than_this &&inode_dirtied_after(inode, *older_than_this))break;list_move(&inode->i_wb_list, &tmp);moved++;if (flags & EXPIRE_DIRTY_ATIME)set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);if (sb_is_blkdev_sb(inode->i_sb))continue;if (sb && sb != inode->i_sb)do_sb_sort = 1;sb = inode->i_sb;}/* just one sb in list, splice to dispatch_queue and we're done */if (!do_sb_sort) {list_splice(&tmp, dispatch_queue);goto out;}/* Move inodes from one superblock together */while (!list_empty(&tmp)) {sb = wb_inode(tmp.prev)->i_sb;list_for_each_prev_safe(pos, node, &tmp) {inode = wb_inode(pos);if (inode->i_sb == sb)list_move(&inode->i_wb_list, dispatch_queue);}}
out:return moved;
}