整体写流程如下

  1. write函数主要分为三步:获取page cache,拷贝数据到page cache,将inode标记为脏并加入b_dirty链表;
  2. 每次写完都要进行bdi dirty 平衡判断,如果需要回写,则唤醒回写线程进行真正的回写;

函数注释如下

/*
 * write(2) entry point: look up the file behind @fd, perform the write at
 * the file's current position and, if the write did not fail, store the
 * advanced position back into the file.
 *
 * Returns the value produced by vfs_write(), or -EBADF when @fd does not
 * resolve to a file.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret;
	loff_t pos;

	/* Nothing was looked up, so there is nothing to put back either. */
	if (!f.file)
		return -EBADF;

	pos = file_pos_read(f.file);
	ret = vfs_write(f.file, buf, count, &pos);
	/* Commit the new offset only when vfs_write() returned >= 0. */
	if (ret >= 0)
		file_pos_write(f.file, pos);
	fdput_pos(f);

	return ret;
}
/*
 * vfs_write - validate a write request and dispatch it to the file's
 * write method.
 *
 * @file:  file being written
 * @buf:   user-space source buffer
 * @count: number of bytes requested
 * @pos:   in/out file position
 *
 * Returns the value of the selected write method, or a negative errno:
 * -EBADF if the file was not opened for writing, -EINVAL if it cannot be
 * written, -EFAULT if @buf fails access_ok(), or the error returned by
 * rw_verify_area().
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	/* The kernel reads from the user buffer, hence VERIFY_READ. */
	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
		return -EFAULT;

	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret >= 0) {
		/* A non-negative result replaces the requested byte count. */
		count = ret;
		file_start_write(file);
		/*
		 * Pick the first method the file provides:
		 * ->write, then ->aio_write, then ->write_iter
		 * (the last one via new_sync_write()).
		 */
		if (file->f_op->write)
			ret = file->f_op->write(file, buf, count, pos);
		else if (file->f_op->aio_write)
			ret = do_sync_write(file, buf, count, pos);
		else
			ret = new_sync_write(file, buf, count, pos);
		if (ret > 0) {
			/* Something was written: notify and account the bytes. */
			fsnotify_modify(file);
			add_wchar(current, ret);
		}
		/* The syscall itself is counted whether or not it succeeded. */
		inc_syscw(current);
		file_end_write(file);
	}

	return ret;
}
/*
 * new_sync_write - run a synchronous write through ->write_iter.
 *
 * Wraps the single user buffer in a one-element iovec, builds a sync kiocb
 * positioned at *@ppos, calls the file's ->write_iter and, if the request
 * was queued (-EIOCBQUEUED), waits for it to complete. *@ppos is updated
 * from the kiocb before returning.
 */
ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec vec = {
		.iov_base = (void __user *)buf,
		.iov_len  = len,
	};
	struct iov_iter from;
	struct kiocb req;
	ssize_t rc;

	init_sync_kiocb(&req, filp);
	req.ki_pos = *ppos;
	req.ki_nbytes = len;
	iov_iter_init(&from, WRITE, &vec, 1, len);

	rc = filp->f_op->write_iter(&req, &from);
	/* Queued rather than completed: block until the result is available. */
	if (rc == -EIOCBQUEUED)
		rc = wait_on_sync_kiocb(&req);

	*ppos = req.ki_pos;
	return rc;
}
/*
 * generic_file_write_iter - locked wrapper around
 * __generic_file_write_iter().
 *
 * Takes inode->i_mutex around the actual write. If bytes were written,
 * generic_write_sync() is called for the range just written and a negative
 * result from it replaces the byte count.
 */
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t written;
	ssize_t sync_err;

	mutex_lock(&inode->i_mutex);
	written = __generic_file_write_iter(iocb, from);
	mutex_unlock(&inode->i_mutex);

	/* Nothing written (or an error): no range to sync. */
	if (written <= 0)
		return written;

	/* ki_pos already points past the data, so step back by its length. */
	sync_err = generic_write_sync(file, iocb->ki_pos - written, written);
	if (sync_err < 0)
		return sync_err;

	return written;
}
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{struct file *file = iocb->ki_filp;struct address_space * mapping = file->f_mapping;struct inode    *inode = mapping->host;loff_t       pos = iocb->ki_pos;ssize_t      written = 0;ssize_t        err;ssize_t     status;size_t       count = iov_iter_count(from);/* We can write back this queue in page reclaim */current->backing_dev_info = inode_to_bdi(inode);err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));if (err)goto out;if (count == 0)goto out;iov_iter_truncate(from, count);err = file_remove_suid(file);if (err)goto out;/* 更新文件时间 */err = file_update_time(file);if (err)goto out;/* 直接读写 */if (io_is_direct(file)) {loff_t endbyte;written = generic_file_direct_write(iocb, from, pos);/** If the write stopped short of completing, fall back to* buffered writes.  Some filesystems do this for writes to* holes, for example.  For DAX files, a buffered write will* not succeed (even if it did, DAX does not handle dirty* page-cache pages correctly).*/if (written < 0 || written == count || IS_DAX(inode))goto out;pos += written;count -= written;status = generic_perform_write(file, from, pos);/** If generic_perform_write() returned a synchronous error* then we want to return the number of bytes which were* direct-written, or the error code if that was zero.  
Note* that this differs from normal direct-io semantics, which* will return -EFOO even if some bytes were written.*/if (unlikely(status < 0)) {err = status;goto out;}iocb->ki_pos = pos + status;/** We need to ensure that the page cache pages are written to* disk and invalidated to preserve the expected O_DIRECT* semantics.*/endbyte = pos + status - 1;err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);if (err == 0) {written += status;invalidate_mapping_pages(mapping,pos >> PAGE_CACHE_SHIFT,endbyte >> PAGE_CACHE_SHIFT);} else {/** We don't know how much we wrote, so just return* the number of bytes which were direct-written*/}} else {/* page cache */written = generic_perform_write(file, from, pos);if (likely(written >= 0))iocb->ki_pos = pos + written;}
out:current->backing_dev_info = NULL;return written ? written : err;
}
/*
 * generic_perform_write - copy data from @i into the page cache of @file,
 * at most one page per iteration, starting at @pos.
 *
 * Each iteration: fault in the source, ->write_begin(), copy into the
 * page, ->write_end(), then let dirty-page balancing decide whether
 * writeback has to be kicked.
 *
 * Returns the number of bytes written if non-zero, otherwise the last
 * status (0 or a negative errno).
 */
ssize_t generic_perform_write(struct file *file,
			      struct iov_iter *i, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	unsigned int flags = 0;
	ssize_t written = 0;
	long status = 0;

	/*
	 * Copies from kernel address space cannot fail (NFSD is a big user).
	 */
	if (!iter_is_iovec(i))
		flags |= AOP_FLAG_UNINTERRUPTIBLE;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
		void *fsdata;

		offset = pos & (PAGE_CACHE_SIZE - 1);
		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
			      iov_iter_count(i));

again:
		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		/* Step 1: prepare the write; hands back the target page. */
		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
					    &page, &fsdata);
		if (unlikely(status < 0))
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/* Step 2: copy the user data into the page-cache page. */
		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		flush_dcache_page(page);

		/* Step 3: finish the write for the bytes actually copied. */
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
					  page, fsdata);
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(!copied)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
				      iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;

		/* Step 4: check whether dirty pages need to be written back. */
		balance_dirty_pages_ratelimited(mapping);
		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}
	} while (iov_iter_count(i));

	return written ? written : status;
}

/** balance_dirty_pages() must be called by processes which are generating dirty* data.  It looks at the number of dirty pages in the machine and will force* the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.* If we're over `background_thresh' then the writeback threads are woken to* perform some writeout.*/
static void balance_dirty_pages(struct address_space *mapping,unsigned long pages_dirtied)
{unsigned long nr_reclaimable;  /* = file_dirty + unstable_nfs */unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */unsigned long background_thresh;unsigned long dirty_thresh;long period;long pause;long max_pause;long min_pause;int nr_dirtied_pause;bool dirty_exceeded = false;unsigned long task_ratelimit;unsigned long dirty_ratelimit;unsigned long pos_ratio;struct backing_dev_info *bdi = inode_to_bdi(mapping->host);bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; //单独门限值回收unsigned long start_time = jiffies;for (;;) {unsigned long now = jiffies;unsigned long uninitialized_var(bdi_thresh);unsigned long thresh;unsigned long uninitialized_var(bdi_dirty);unsigned long dirty;unsigned long bg_thresh;/** Unstable writes are a feature of certain networked* filesystems (i.e. NFS) in which data may have been* written to the server's write cache, but has not yet* been flushed to permanent storage.*/nr_reclaimable = global_page_state(NR_FILE_DIRTY) +global_page_state(NR_UNSTABLE_NFS);  /* 全局 文件脏页  + 网络文件系统 */nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); /*全局 文件总的脏页+包括正在回写 */global_dirty_limits(&background_thresh, &dirty_thresh);//获取两个门限值if (unlikely(strictlimit)) {  /* 单独bdi回收 */bdi_dirty_limits(bdi, dirty_thresh, background_thresh,&bdi_dirty, &bdi_thresh, &bg_thresh);dirty = bdi_dirty;thresh = bdi_thresh;} else {                       /* 全局回收 */dirty = nr_dirty;          /* 全局 文件总的脏页+包括正在回写 */thresh = dirty_thresh;bg_thresh = background_thresh;}/** Throttle it only when the background writeback cannot* catch-up. This avoids (excessively) small writeouts* when the bdi limits are ramping up in case of !strictlimit.** In strictlimit case make decision based on the bdi counters* and limits. 
Small writeouts when the bdi limits are ramping* up are the price we consciously pay for strictlimit-ing.*/if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {  //(thresh + bg_thresh) / 2; 不回收current->dirty_paused_when = now;current->nr_dirtied = 0;   /* 脏页数量重新置0 */current->nr_dirtied_pause =dirty_poll_interval(dirty, thresh);break;}if (unlikely(!writeback_in_progress(bdi)))  /* 唤醒真正的回写线程 */bdi_start_background_writeback(bdi);if (!strictlimit)bdi_dirty_limits(bdi, dirty_thresh, background_thresh,&bdi_dirty, &bdi_thresh, NULL);//nr_dirty > dirty_threshdirty_exceeded = (bdi_dirty > bdi_thresh) &&((nr_dirty > dirty_thresh) || strictlimit); //超过门限if (dirty_exceeded && !bdi->dirty_exceeded)bdi->dirty_exceeded = 1;                        //超过门限,后面需要加速回收bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,nr_dirty, bdi_thresh, bdi_dirty,start_time);dirty_ratelimit = bdi->dirty_ratelimit;pos_ratio = bdi_position_ratio(bdi, dirty_thresh,background_thresh, nr_dirty,bdi_thresh, bdi_dirty);task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>RATELIMIT_CALC_SHIFT;max_pause = bdi_max_pause(bdi, bdi_dirty);min_pause = bdi_min_pause(bdi, max_pause,task_ratelimit, dirty_ratelimit,&nr_dirtied_pause);if (unlikely(task_ratelimit == 0)) {period = max_pause;pause = max_pause;goto pause;}period = HZ * pages_dirtied / task_ratelimit;pause = period;if (current->dirty_paused_when)pause -= now - current->dirty_paused_when;/** For less than 1s think time (ext3/4 may block the dirtier* for up to 800ms from time to time on 1-HDD; so does xfs,* however at much less frequency), try to compensate it in* future periods by updating the virtual time; otherwise just* do a reset, as it may be a light dirtier.*/if (pause < min_pause) {trace_balance_dirty_pages(bdi,dirty_thresh,background_thresh,nr_dirty,bdi_thresh,bdi_dirty,dirty_ratelimit,task_ratelimit,pages_dirtied,period,min(pause, 0L),start_time);if (pause < -HZ) {current->dirty_paused_when = now;current->nr_dirtied = 0;} else if (period) 
{current->dirty_paused_when += period;current->nr_dirtied = 0;} else if (current->nr_dirtied_pause <= pages_dirtied)current->nr_dirtied_pause += pages_dirtied;break;}if (unlikely(pause > max_pause)) {/* for occasional dropped task_ratelimit */now += min(pause - max_pause, max_pause);pause = max_pause;}pause:trace_balance_dirty_pages(bdi,dirty_thresh,background_thresh,nr_dirty,bdi_thresh,bdi_dirty,dirty_ratelimit,task_ratelimit,pages_dirtied,period,pause,start_time);__set_current_state(TASK_KILLABLE);io_schedule_timeout(pause);//有可能会切出去,但最大超过200mscurrent->dirty_paused_when = now + pause;current->nr_dirtied = 0;current->nr_dirtied_pause = nr_dirtied_pause;/** This is typically equal to (nr_dirty < dirty_thresh) and can* also keep "1000+ dd on a slow USB stick" under control.*/if (task_ratelimit)break;/** In the case of an unresponding NFS server and the NFS dirty* pages exceeds dirty_thresh, give the other good bdi's a pipe* to go through, so that tasks on them still remain responsive.** In theory 1 page is enough to keep the comsumer-producer* pipe going: the flusher cleans 1 page => the task dirties 1* more page. However bdi_dirty has accounting errors.  So use* the larger and more IO friendly bdi_stat_error.*/if (bdi_dirty <= bdi_stat_error(bdi))break;if (fatal_signal_pending(current))break;}if (!dirty_exceeded && bdi->dirty_exceeded)  //如果不超过门限,则置0bdi->dirty_exceeded = 0;if (writeback_in_progress(bdi))  //正在回收,则退出return;/** In laptop mode, we wait until hitting the higher threshold before* starting background writeout, and then write out all the way down* to the lower threshold.  So slow writers cause minimal disk activity.** In normal mode, we start background writeout at the lower* background_thresh, to keep the amount of dirty memory low.*/if (laptop_mode)return;if (nr_reclaimable > background_thresh) //可回收的页面大于background_thresh,则触发线程异步回收bdi_start_background_writeback(bdi);
}
/*
 * bdi_start_background_writeback - kick the flusher for @bdi.
 *
 * No work item is queued here; the flusher thread is simply woken after
 * the wake-up has been traced.
 */
void bdi_start_background_writeback(struct backing_dev_info *bdi)
{
	/*
	 * We just wake up the flusher thread. It will perform background
	 * writeback as soon as there is no other work to do.
	 */
	trace_writeback_wake_background(bdi);

	bdi_wakeup_thread(bdi);
}

/* 这里并没有wb_writeback_work加入到bdi->work_list 也就是唤醒的线程没有work要处理 */

/*
 * bdi_wakeup_thread - schedule the bdi's delayed writeback work to run
 * immediately (delay 0), provided the bdi is still registered.
 *
 * Note that no wb_writeback_work is added to bdi->work_list here.
 * The registration check and the rescheduling both happen under
 * bdi->wb_lock.
 */
static void bdi_wakeup_thread(struct backing_dev_info *bdi)
{
	spin_lock_bh(&bdi->wb_lock);

	if (test_bit(BDI_registered, &bdi->state))
		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);

	spin_unlock_bh(&bdi->wb_lock);
}

回收处理

主要有三种回收:work回收、周期回收过期数据、超过门限回收;

static long wb_do_writeback(struct bdi_writeback *wb)
{struct backing_dev_info *bdi = wb->bdi;struct wb_writeback_work *work;long wrote = 0;set_bit(BDI_writeback_running, &wb->bdi->state);while ((work = get_next_work_item(bdi)) != NULL) {  //1:work触发回收trace_writeback_exec(bdi, work);wrote += wb_writeback(wb, work);/** Notify the caller of completion if this is a synchronous* work item, otherwise just free it.*/if (work->done)complete(work->done);elsekfree(work);}/** Check for periodic writeback, kupdated() style*/wrote += wb_check_old_data_flush(wb);    //2:过期一定时间的page回收(默认30S)wrote += wb_check_background_flush(wb);  //3:门限回收clear_bit(BDI_writeback_running, &wb->bdi->state);return wrote;
}

先来看wb_check_old_data_flush和wb_check_background_flush的wb_writeback_work设置:
这两个属于系统设置的回收:
wb_check_old_data_flush:超过一定时间的脏页回收,一般为30S,5S判断一次;(dirty_writeback_interval、dirty_expire_interval)
wb_check_background_flush:超过门限进行回收;设置参数为(dirty_background_bytes、dirty_background_ratio)

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{unsigned long expired;long nr_pages;/** When set to zero, disable periodic writeback*/if (!dirty_writeback_interval)return 0;expired = wb->last_old_flush +msecs_to_jiffies(dirty_writeback_interval * 10);if (time_before(jiffies, expired))return 0;/* 更新本次运行的起始时间 */wb->last_old_flush = jiffies;  nr_pages = get_nr_dirty_pages();if (nr_pages) {struct wb_writeback_work work = {.nr_pages = nr_pages,.sync_mode  = WB_SYNC_NONE,.for_kupdate    = 1,.range_cyclic  = 1,.reason        = WB_REASON_PERIODIC,};return wb_writeback(wb, &work);}return 0;
}static long wb_check_background_flush(struct bdi_writeback *wb)
{if (over_bground_thresh(wb->bdi)) {struct wb_writeback_work work = {.nr_pages  = LONG_MAX,  /* 页数设置为无穷, */.sync_mode   = WB_SYNC_NONE,.for_background = 1, .range_cyclic = 1,.reason        = WB_REASON_BACKGROUND,};return wb_writeback(wb, &work);}return 0;
}
static long wb_writeback(struct bdi_writeback *wb,struct wb_writeback_work *work)
{unsigned long wb_start = jiffies;long nr_pages = work->nr_pages;unsigned long oldest_jif;struct inode *inode;long progress;oldest_jif = jiffies;work->older_than_this = &oldest_jif; //指针赋值spin_lock(&wb->list_lock);/* * 结束的条件为回写了nr_pages(达到了回收的目标),或者回收完了* wb_check_background_flush进入时nr_pages设置为无穷,退出循环为达到门限以下; */for (;;) {/** Stop writeback when nr_pages has been consumed*/if (work->nr_pages <= 0)break;/** Background writeout and kupdate-style writeback may* run forever. Stop them if there is other work to do* so that e.g. sync can proceed. They'll be restarted* after the other works are all done.*//* 优先处理work回收,门限回收和周期回收优先级低 */if ((work->for_background || work->for_kupdate) &&!list_empty(&wb->bdi->work_list))break;/** For background writeout, stop when we are below the* background dirty threshold*//* 背景回收先判断是否超过门限,如果在门限以下则退出 */if (work->for_background && !over_bground_thresh(wb->bdi)) break;/** Kupdate and background works are special and we want to* include all inodes that need writing. Livelock avoidance is* handled by these works yielding to any other work so we are* safe.*/if (work->for_kupdate) {oldest_jif = jiffies -msecs_to_jiffies(dirty_expire_interval * 10); /* 过期的数据需要回写,默认30S之前。在queue_io中使用 */} else if (work->for_background)oldest_jif = jiffies;                             /*  */trace_writeback_start(wb->bdi, work);if (list_empty(&wb->b_io))          /* 如果bio队列为空,需要从more_io和dirty_io中拿一些过来 */queue_io(wb, work);if (work->sb)progress = writeback_sb_inodes(work->sb, wb, work);  /* 一般为这个分支 */elseprogress = __writeback_inodes_wb(wb, work);trace_writeback_written(wb->bdi, work);wb_update_bandwidth(wb, wb_start);/** Did we write something? Try for more** Dirty inodes are moved to b_io for writeback in batches.* The completion of the current batch does not necessarily* mean the overall work is done. 
So we keep looping as long* as made some progress on cleaning pages or inodes.*//* 如果这次循环触发了写inode,则继续循环判断 */if (progress)continue;/** No more inodes for IO, bail*//* 本次没写且more_io页为空,说明没有可写的了,退出 */if (list_empty(&wb->b_more_io))break;/** Nothing written. Wait for some inode to* become available for writeback. Otherwise* we'll just busyloop.*//* 如果每次没有写且more_io不为空,说明别的地方调用了回写,此处等待回写完成 */if (!list_empty(&wb->b_more_io))  {trace_writeback_wait(wb->bdi, work);inode = wb_inode(wb->b_more_io.prev);spin_lock(&inode->i_lock);spin_unlock(&wb->list_lock);/* This function drops i_lock... */inode_sleep_on_writeback(inode);spin_lock(&wb->list_lock);}}spin_unlock(&wb->list_lock);return nr_pages - work->nr_pages;
}

/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * Walks wb->b_io from its tail. The lock/unlock sequence in the loop
 * implies that wb->list_lock is held on entry and on return; it is dropped
 * around the actual writeback of each inode.
 *
 * Return the number of pages and/or inodes written.
 */
static long writeback_sb_inodes(struct super_block *sb,
				struct bdi_writeback *wb,
				struct wb_writeback_work *work)
{
	struct writeback_control wbc = {
		.sync_mode		= work->sync_mode,
		.tagged_writepages	= work->tagged_writepages,
		.for_kupdate		= work->for_kupdate,
		.for_background		= work->for_background,
		.for_sync		= work->for_sync,
		.range_cyclic		= work->range_cyclic,
		.range_start		= 0,
		.range_end		= LLONG_MAX,	/* whole file range */
	};
	unsigned long start_time = jiffies;
	long write_chunk;
	long wrote = 0;  /* count both pages and inodes */

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);

		/*
		 * The inode is on a different superblock than @sb:
		 *  - if the work is restricted to one superblock, put the
		 *    inode back with redirty_tail() and keep scanning;
		 *  - otherwise return so the caller can switch superblocks.
		 */
		if (inode->i_sb != sb) {
			if (work->sb) {
				/*
				 * We only want to write back data for this
				 * superblock, move all inodes not belonging
				 * to it back onto the dirty list.
				 */
				redirty_tail(inode, wb);
				continue;
			}

			/*
			 * The inode belongs to a different superblock.
			 * Bounce back to the caller to unpin this and
			 * pin the next superblock.
			 */
			break;
		}

		/*
		 * Don't bother with new inodes or inodes being freed, first
		 * kind does not need periodic writeout yet, and for the latter
		 * kind writeout is handled by the freer.
		 */
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			redirty_tail(inode, wb);
			continue;
		}
		/*
		 * Inode already under writeback (I_SYNC) and this is not a
		 * data-integrity (WB_SYNC_ALL) pass: do not wait for it.
		 */
		if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
			/*
			 * If this inode is locked for writeback and we are not
			 * doing writeback-for-data-integrity, move it to
			 * b_more_io so that writeback can proceed with the
			 * other inodes on s_io.
			 *
			 * We'll have another go at writing back this inode
			 * when we completed a full scan of b_io.
			 */
			spin_unlock(&inode->i_lock);
			requeue_io(inode, wb);
			trace_writeback_sb_inodes_requeue(inode);
			continue;
		}
		spin_unlock(&wb->list_lock);

		/*
		 * We already requeued the inode if it had I_SYNC set and we
		 * are doing WB_SYNC_NONE writeback. So this catches only the
		 * WB_SYNC_ALL case.
		 */
		if (inode->i_state & I_SYNC) {
			/* Wait for I_SYNC. This function drops i_lock... */
			inode_sleep_on_writeback(inode);
			/* Inode may be gone, start again */
			spin_lock(&wb->list_lock);
			continue;
		}
		/* Mark the inode as being written back by us. */
		inode->i_state |= I_SYNC;
		spin_unlock(&inode->i_lock);

		write_chunk = writeback_chunk_size(wb->bdi, work);
		wbc.nr_to_write = write_chunk;	/* page budget for this inode */
		wbc.pages_skipped = 0;

		/*
		 * We use I_SYNC to pin the inode in memory. While it is set
		 * evict_inode() will wait so the inode cannot be freed.
		 */
		__writeback_single_inode(inode, &wbc);

		/* Account the pages consumed from the per-inode budget. */
		work->nr_pages -= write_chunk - wbc.nr_to_write;
		wrote += write_chunk - wbc.nr_to_write;
		/* Retake list_lock before touching the writeback lists. */
		spin_lock(&wb->list_lock);
		spin_lock(&inode->i_lock);
		/* An inode that ended up fully clean counts as progress too. */
		if (!(inode->i_state & I_DIRTY_ALL))
			wrote++;
		/* Put the inode on the list matching its new state. */
		requeue_inode(inode, wb, &wbc);
		/*
		 * Ends the I_SYNC section begun above; presumably also wakes
		 * tasks sleeping in inode_sleep_on_writeback() (helper not
		 * shown here).
		 */
		inode_sync_complete(inode);
		spin_unlock(&inode->i_lock);
		cond_resched_lock(&wb->list_lock);
		/*
		 * bail out to wb_writeback() often enough to check
		 * background threshold and other termination conditions.
		 */
		if (wrote) {
			if (time_is_before_jiffies(start_time + HZ / 10UL))
				break;
			if (work->nr_pages <= 0)
				break;
		}
	}
	return wrote;
}
static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{struct address_space *mapping = inode->i_mapping;long nr_to_write = wbc->nr_to_write;unsigned dirty;int ret;WARN_ON(!(inode->i_state & I_SYNC));trace_writeback_single_inode_start(inode, wbc, nr_to_write);ret = do_writepages(mapping, wbc);  /* 触发inode写 *//** Make sure to wait on the data before writing out the metadata.* This is important for filesystems that modify metadata on data* I/O completion. We don't do it for sync(2) writeback because it has a* separate, external IO completion path and ->sync_fs for guaranteeing* inode metadata is written back correctly.*/if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {int err = filemap_fdatawait(mapping);      /* 等待回写完成 */if (ret == 0)ret = err;}/** Some filesystems may redirty the inode during the writeback* due to delalloc, clear dirty metadata flags right before* write_inode()*/spin_lock(&inode->i_lock);dirty = inode->i_state & I_DIRTY;if (inode->i_state & I_DIRTY_TIME) {if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||unlikely(time_after(jiffies,(inode->dirtied_time_when +dirtytime_expire_interval * HZ)))) {dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;trace_writeback_lazytime(inode);}} elseinode->i_state &= ~I_DIRTY_TIME_EXPIRED;inode->i_state &= ~dirty;/** Paired with smp_mb() in __mark_inode_dirty().  This allows* __mark_inode_dirty() to test i_state without grabbing i_lock -* either they see the I_DIRTY bits cleared or we see the dirtied* inode.** I_DIRTY_PAGES is always cleared together above even if @mapping* still has dirty pages.  The flag is reinstated after smp_mb() if* necessary.  
This guarantees that either __mark_inode_dirty()* sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.*/smp_mb();if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))inode->i_state |= I_DIRTY_PAGES;spin_unlock(&inode->i_lock);if (dirty & I_DIRTY_TIME)mark_inode_dirty_sync(inode);/* Don't write the inode if only I_DIRTY_PAGES was set *//* 如果只有文件数据修改,则不需要回写inode,否则要回写inode */if (dirty & ~I_DIRTY_PAGES) {int err = write_inode(inode, wbc);  /* write_inode是一个同步函数 */if (ret == 0)ret = err;}trace_writeback_single_inode(inode, wbc, nr_to_write);return ret;
}

问题一:
dirty_writeback_interval、dirty_expire_interval两个参数怎么起作用?
dirty_writeback_interval作用在wb_check_old_data_flush:如果距离上次周期回写的时间间隔未到,则不触发回收;
dirty_expire_interval作用在wb_writeback(计算oldest_jif)和move_expired_inodes:只有脏了超过该时间的inode才会被移动到b_io中进行回收


static long wb_check_old_data_flush(struct bdi_writeback *wb)
{unsigned long expired;long nr_pages;/** When set to zero, disable periodic writeback*/if (!dirty_writeback_interval)return 0;expired = wb->last_old_flush +msecs_to_jiffies(dirty_writeback_interval * 10);if (time_before(jiffies, expired))return 0;...return 0;
}在static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{int moved;assert_spin_locked(&wb->list_lock);list_splice_init(&wb->b_more_io, &wb->b_io);                   //more_io 无条件移动到 wb_iomoved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); //移动超时的节点b_dirty->b_iomoved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,EXPIRE_DIRTY_ATIME, work);trace_writeback_queue_io(wb, work, moved);
}/** Move expired (dirtied before work->older_than_this) dirty inodes from* @delaying_queue to @dispatch_queue.*/
static int move_expired_inodes(struct list_head *delaying_queue,struct list_head *dispatch_queue,int flags,struct wb_writeback_work *work)
{unsigned long *older_than_this = NULL;unsigned long expire_time;LIST_HEAD(tmp);struct list_head *pos, *node;struct super_block *sb = NULL;struct inode *inode;int do_sb_sort = 0;int moved = 0;/* * 对于wb_check_old_data_flush来说,older_than_this = work->older_than_this;* 对于wb_check_background_flush来说,for_sync=0,expire_time = jiffies - (dirtytime_expire_interval * HZ)* older_than_this 为空的话,则不比较时间,所有都移动。什么情况下???*/if ((flags & EXPIRE_DIRTY_ATIME) == 0)          /* 使用传入进来的参数 */older_than_this = work->older_than_this;else if (!work->for_sync) {                     /* 如果没有强制刷新标志,则设置时间为默认30S之前 */expire_time = jiffies - (dirtytime_expire_interval * HZ);older_than_this = &expire_time;          }                                               /* 其他的情况,不比较时间,都移动 */while (!list_empty(delaying_queue)) {inode = wb_inode(delaying_queue->prev);/* 如果当前节点没有过期的话,则后面的节点肯定没有过期,则直接退出即可 */if (older_than_this &&inode_dirtied_after(inode, *older_than_this))break;list_move(&inode->i_wb_list, &tmp);moved++;if (flags & EXPIRE_DIRTY_ATIME)set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);if (sb_is_blkdev_sb(inode->i_sb))continue;if (sb && sb != inode->i_sb)do_sb_sort = 1;sb = inode->i_sb;}/* just one sb in list, splice to dispatch_queue and we're done */if (!do_sb_sort) {list_splice(&tmp, dispatch_queue);goto out;}/* Move inodes from one superblock together */while (!list_empty(&tmp)) {sb = wb_inode(tmp.prev)->i_sb;list_for_each_prev_safe(pos, node, &tmp) {inode = wb_inode(pos);if (inode->i_sb == sb)list_move(&inode->i_wb_list, dispatch_queue);}}
out:return moved;
}

Linux 文件系统写-ext2流程相关推荐

  1. Linux 文件系统及 ext2 文件系统

    linux 支持的文件系统类型 Ext2:     有点像 UNIX 文件系统.有 blocks,inodes,directories 的概念. Ext3:     Ext2 的加强版,添加了日志的功 ...

  2. Ext2, Ext3和Ext4是什么以及如何创建和转换Linux文件系统

    按照这里的指导,任何人能够转换它们的文件系统,但因为以下任务需要高级的管理规范,在做这之前,确保你必须做了重要的备份.如果遇到任何错误,至少你能够用备份数据恢复. 在计算机中,文件系统是命名并且逻辑放 ...

  3. ext显示服务器文件,使用linux的fsck.ext2命令检查及修复ext2文件系统

    使用linux的fsck.ext2命令检查及修复ext2文件系统 发布时间:2020-07-18 13:40:16 来源:亿速云 阅读:138 作者:清晨 栏目:服务器 不懂使用linux的fsck. ...

  4. linux文件系统什么格式转换,什么是Ext2,Ext3和Ext4以及如何创建和转换Linux文件系统...

    我用我的Fedora旧系统进行测试,我从ext2转换为ext3,EXT2 到ext4和ext3到ext4文件系统成功. 按照本指南任何人都可以巧妙地转换自己的文件系统,但我仍然想提醒你将在此之前,因为 ...

  5. Linux文件系统IO:直接IO原理与实现:缓存I/O、直接I/O

    目录 缓存I/O 缓存I/O的优缺点 直接I/O 直接I/O实现 - direct_IO(), brw_kiovec() 推荐阅读 缓存I/O 一般来说,当调用 open() 系统调用打开文件时,如果 ...

  6. Linux 文件系统原理 / 虚拟文件系统VFS

    Linux 文件系统原理 / 虚拟文件系统VFS 虚拟文件系统 VFS VFS 定义 VFS 的对象演绎 超级块 super_block 索引节点 inode 目录项 dentry 文件 file 文 ...

  7. 深入理解Linux文件系统之文件系统挂载(下)

    接着: 深入理解Linux文件系统之文件系统挂载(上) 本文为文件系统挂载专题文章的第二篇,主要介绍如何通过挂载实例关联挂载点和超级块并添加到全局文件系统树. 4. 添加到全局文件系统树 4.1 do ...

  8. Linux 文件系统(VFS、EXT、proc)

    主要参考了<深入linux内核>和<Linux内核深度解析>,另外简单浅析了一下相关内容 文章目录 通用文件模型及VfS文件结构 基础知识 文件系统种类 常见的文件系统 VFS ...

  9. linux文件系统——磁盘分区、Ext格式、文件属性与权限

    目录 一.磁盘 1. 机械磁盘组成 2. 磁盘逻辑结构 3. 磁盘分区 4. 分区与文件系统 二.文件系统 1. 目录树 2. Ext2文件系统(inode) 3. inode与目录树的关系 4. 文 ...

最新文章

  1. 电子秤专用模拟/数字(A/D)转换器芯片 HX711
  2. java5的递归算法_Java基础入门(五)之方法以及递归算法
  3. hadoop 查看节点主备装填_基于ZooKeeper搭建Hadoop高可用集群
  4. 复习之 c实现 通讯录
  5. Nginx——事件驱动机制(雷霆追风问题,负载均衡)
  6. python接口 同花顺_利用python探索股票市场数据指南
  7. 啊!多么痛的领悟···
  8. 如何腾出计算机内存,win7系统(取消)删除虚拟内存让硬盘空间轻松腾出来
  9. 字符串匹配BF/RK/BM/KMP算法
  10. 数据结构之二叉排序树
  11. 堪称神器的命令行工具系列——curl
  12. layui table 表格两种赋值方式下,data分页效果有效, url分页效果的失效 问题的解决。
  13. Java中parseInt()和valueOf(),toString()的区别
  14. 极域电子教室破解还原卡
  15. 宏病毒专杀软件测试大乐,推荐几个宏病毒专杀工具
  16. 转载:技术大停滞——范式春梦中的地球工业文明2:科技利益集团鼓吹的范式春梦—所谓的技术大爆炸
  17. 《平成的超级偶像金牌舔狗》之mmdetection,paddle detection安装,demo跑通,训练跑通,保姆级教学
  18. 【机器学习】补完计划
  19. 【小程序“600002“】现象:小程序测试版能正常的进行页面跳转,正式版不能进行页面跳转
  20. S7-200SMART PLC的IP更改方法

热门文章

  1. ORB-SLAM3 土味分析
  2. 用html5颜色自动渐变,HTML5渐变背景色生成器
  3. 光明区科技型中小企业投融资奖励申报条件及材料,补贴300万
  4. mysql 伪表查询语句_MySQL数据库之select查询语句
  5. hive中json格式字段解析及map使用
  6. Python-多元线性回归方程比较最小二乘法与梯度下降法
  7. 有源RFID技术部署,需考虑哪些因素?
  8. WinForm条码打印
  9. MySql深分页问题详解
  10. 霍尼韦尔Honeywell MS5145和MS9540条码扫描枪如何设置成自动扫描?(长亮)