Merge 4.9.51 into android-4.9

Changes in 4.9.51
	ipv6: accept 64k - 1 packet length in ip6_find_1stfragopt()
	ipv6: add rcu grace period before freeing fib6_node
	ipv6: fix sparse warning on rt6i_node
	macsec: add genl family module alias
	udp: on peeking bad csum, drop packets even if not at head
	fsl/man: Inherit parent device and of_node
	sctp: Avoid out-of-bounds reads from address storage
	qlge: avoid memcpy buffer overflow
	netvsc: fix deadlock between link status and removal
	cxgb4: Fix stack out-of-bounds read due to wrong size to t4_record_mbox()
	packet: Don't write vnet header beyond end of buffer
	kcm: do not attach PF_KCM sockets to avoid deadlock
	Revert "net: phy: Correctly process PHY_HALTED in phy_stop_machine()"
	tcp: initialize rcv_mss to TCP_MIN_MSS instead of 0
	mlxsw: spectrum: Forbid linking to devices that have uppers
	bridge: switchdev: Clear forward mark when transmitting packet
	Revert "net: use lib/percpu_counter API for fragmentation mem accounting"
	Revert "net: fix percpu memory leaks"
	gianfar: Fix Tx flow control deactivation
	vhost_net: correctly check tx avail during rx busy polling
	ip6_gre: update mtu properly in ip6gre_err
	ipv6: fix memory leak with multiple tables during netns destruction
	ipv6: fix typo in fib6_net_exit()
	sctp: fix missing wake ups in some situations
	ip_tunnel: fix setting ttl and tos value in collect_md mode
	f2fs: let fill_super handle roll-forward errors
	f2fs: check hot_data for roll-forward recovery
	x86/fsgsbase/64: Fully initialize FS and GS state in start_thread_common
	x86/fsgsbase/64: Report FSBASE and GSBASE correctly in core dumps
	x86/switch_to/64: Rewrite FS/GS switching yet again to fix AMD CPUs
	xfs: Move handling of missing page into one place in xfs_find_get_desired_pgoff()
	xfs: fix spurious spin_is_locked() assert failures on non-smp kernels
	xfs: push buffer of flush locked dquot to avoid quotacheck deadlock
	xfs: try to avoid blowing out the transaction reservation when bunmaping a shared extent
	xfs: release bli from transaction properly on fs shutdown
	xfs: remove bli from AIL before release on transaction abort
	xfs: don't allow bmap on rt files
	xfs: free uncommitted transactions during log recovery
	xfs: free cowblocks and retry on buffered write ENOSPC
	xfs: don't crash on unexpected holes in dir/attr btrees
	xfs: check _btree_check_block value
	xfs: set firstfsb to NULLFSBLOCK before feeding it to _bmapi_write
	xfs: check _alloc_read_agf buffer pointer before using
	xfs: fix quotacheck dquot id overflow infinite loop
	xfs: fix multi-AG deadlock in xfs_bunmapi
	xfs: Fix per-inode DAX flag inheritance
	xfs: fix inobt inode allocation search optimization
	xfs: clear MS_ACTIVE after finishing log recovery
	xfs: don't leak quotacheck dquots when cow recovery
	iomap: fix integer truncation issues in the zeroing and dirtying helpers
	xfs: write unmount record for ro mounts
	xfs: toggle readonly state around xfs_log_mount_finish
	xfs: remove xfs_trans_ail_delete_bulk
	xfs: Add infrastructure needed for error propagation during buffer IO failure
	xfs: Properly retry failed inode items in case of error during buffer writeback
	xfs: fix recovery failure when log record header wraps log end
	xfs: always verify the log tail during recovery
	xfs: fix log recovery corruption error due to tail overwrite
	xfs: handle -EFSCORRUPTED during head/tail verification
	xfs: add log recovery tracepoint for head/tail
	xfs: stop searching for free slots in an inode chunk when there are none
	xfs: evict all inodes involved with log redo item
	xfs: check for race with xfs_reclaim_inode() in xfs_ifree_cluster()
	xfs: open-code xfs_buf_item_dirty()
	xfs: remove unnecessary dirty bli format check for ordered bufs
	xfs: ordered buffer log items are never formatted
	xfs: refactor buffer logging into buffer dirtying helper
	xfs: don't log dirty ranges for ordered buffers
	xfs: skip bmbt block ino validation during owner change
	xfs: move bmbt owner change to last step of extent swap
	xfs: disallow marking previously dirty buffers as ordered
	xfs: relog dirty buffers during swapext bmbt owner change
	xfs: disable per-inode DAX flag
	xfs: fix incorrect log_flushed on fsync
	xfs: don't set v3 xflags for v2 inodes
	xfs: open code end_buffer_async_write in xfs_finish_page_writeback
	xfs: use kmem_free to free return value of kmem_zalloc
	md/raid5: release/flush io in raid5_do_work()
	xfs: fix compiler warnings
	ipv6: Fix may be used uninitialized warning in rt6_check
	Linux 4.9.51

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
This commit is contained in:
Greg Kroah-Hartman
2017-09-20 09:59:51 +02:00
79 changed files with 1251 additions and 674 deletions

View File

@@ -1,6 +1,6 @@
VERSION = 4
PATCHLEVEL = 9
SUBLEVEL = 50
SUBLEVEL = 51
EXTRAVERSION =
NAME = Roaring Lionus

View File

@@ -204,6 +204,7 @@ void set_personality_ia32(bool);
#define ELF_CORE_COPY_REGS(pr_reg, regs) \
do { \
unsigned long base; \
unsigned v; \
(pr_reg)[0] = (regs)->r15; \
(pr_reg)[1] = (regs)->r14; \
@@ -226,8 +227,8 @@ do { \
(pr_reg)[18] = (regs)->flags; \
(pr_reg)[19] = (regs)->sp; \
(pr_reg)[20] = (regs)->ss; \
(pr_reg)[21] = current->thread.fsbase; \
(pr_reg)[22] = current->thread.gsbase; \
rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \
rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \
asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \

View File

@@ -136,6 +136,123 @@ void release_thread(struct task_struct *dead_task)
}
}
/* Tells the FS/GS helpers below which of the two segment registers to act on. */
enum which_selector {
FS,
GS
};
/*
 * Record the FS or GS base of an outgoing thread when the FSGSBASE
 * extensions are not available.  This runs on the context-switch hot path,
 * so it is kept cheap for non-FSGSBASE systems and is forcibly inlined to
 * get better generated code.
 *
 * @prev_p:   task being switched out
 * @selector: selector value that was read from the segment register
 * @which:    FS or GS
 */
static __always_inline void save_base_legacy(struct task_struct *prev_p,
					     unsigned short selector,
					     enum which_selector which)
{
	/*
	 * Null selector: keep whatever base is already saved.
	 *
	 * On Intel (no X86_BUG_NULL_SEG) the live base is either the saved
	 * base or zero; on AMD (X86_BUG_NULL_SEG) it can be almost anything.
	 * This path is taken twice on nearly every switch between 64-bit
	 * programs, and skipping the RDMSR is a big win, so the saved value
	 * is simply trusted.  That is what Linux has always done, so existing
	 * applications keep working.
	 *
	 * To avoid leaking state on non-X86_BUG_NULL_SEG CPUs, a base that is
	 * reported as zero must really be zero; load_seg_legacy holds the
	 * matching logic.
	 */
	if (likely(selector == 0))
		return;

	/*
	 * Non-null selector: the saved base is dropped.
	 *
	 * For selectors 1, 2 and 3 the base is zero on !X86_BUG_NULL_SEG
	 * CPUs and arbitrary on X86_BUG_NULL_SEG CPUs, where Linux has never
	 * tried to preserve it across context switches.  A selector above 3
	 * names a real segment, so there is no need to save the base.
	 */
	switch (which) {
	case FS:
		prev_p->thread.fsbase = 0;
		break;
	default:
		prev_p->thread.gsbase = 0;
		break;
	}
}
/*
 * Snapshot the FS and GS selectors of @task into its thread state and let
 * save_base_legacy() update the corresponding saved bases.
 */
static __always_inline void save_fsgs(struct task_struct *task)
{
	/* FS: selector first, then the base bookkeeping that depends on it. */
	savesegment(fs, task->thread.fsindex);
	save_base_legacy(task, task->thread.fsindex, FS);

	/* GS: same two steps. */
	savesegment(gs, task->thread.gsindex);
	save_base_legacy(task, task->thread.gsindex, GS);
}
/*
 * Load selector @sel into the register chosen by @which: FS goes through
 * loadsegment(), anything else goes through load_gs_index().
 */
static __always_inline void loadseg(enum which_selector which,
				    unsigned short sel)
{
	if (which != FS) {
		load_gs_index(sel);
		return;
	}

	loadsegment(fs, sel);
}
/*
 * Install the incoming task's FS or GS selector and base.
 *
 * @prev_index: selector saved for the outgoing task
 * @prev_base:  base saved for the outgoing task
 * @next_index: selector of the incoming task
 * @next_base:  base of the incoming task
 * @which:      FS or GS
 */
static __always_inline void load_seg_legacy(unsigned short prev_index,
					    unsigned long prev_base,
					    unsigned short next_index,
					    unsigned long next_base,
					    enum which_selector which)
{
	/*
	 * The next task is using a real segment: loading the selector is
	 * sufficient.
	 */
	if (unlikely(next_index > 3)) {
		loadseg(which, next_index);
		return;
	}

	/*
	 * From here on the next task is using 64-bit TLS, is not using this
	 * segment at all, or is having fun with arcane CPU features.
	 */

	if (next_base != 0) {
		/* Reload the selector only if it changes, then set the base. */
		if (prev_index != next_index)
			loadseg(which, next_index);
		wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
		       next_base);
		return;
	}

	/* Nasty case: on AMD CPUs the base has to be forcibly zeroed. */
	if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
		loadseg(which, __USER_DS);
		loadseg(which, next_index);
		return;
	}

	/*
	 * Rather than exhaustively detecting every case in which the segment
	 * load could be skipped, handle the only one that matters for
	 * performance: when both the previous and the next state are fully
	 * zeroed, no load is needed.
	 *
	 * (This relies on prev_base == 0 having no false positives, which
	 * holds on Intel-style CPUs.)
	 */
	if (likely(prev_index | next_index | prev_base))
		loadseg(which, next_index);
}
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
@@ -216,10 +333,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp,
unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
WARN_ON_ONCE(regs != current_pt_regs());
if (static_cpu_has(X86_BUG_NULL_SEG)) {
/* Loading zero below won't clear the base. */
loadsegment(fs, __USER_DS);
load_gs_index(__USER_DS);
}
loadsegment(fs, 0);
loadsegment(es, _ds);
loadsegment(ds, _ds);
load_gs_index(0);
regs->ip = new_ip;
regs->sp = new_sp;
regs->cs = _cs;
@@ -264,7 +390,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
unsigned prev_fsindex, prev_gsindex;
fpu_switch_t fpu_switch;
fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
@@ -274,8 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*
* (e.g. xen_load_tls())
*/
savesegment(fs, prev_fsindex);
savesegment(gs, prev_gsindex);
save_fsgs(prev_p);
/*
* Load TLS before restoring any segments so that segment loads
@@ -314,108 +438,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);
/*
* Switch FS and GS.
*
* These are even more complicated than DS and ES: they have
* 64-bit bases are that controlled by arch_prctl. The bases
* don't necessarily match the selectors, as user code can do
* any number of things to cause them to be inconsistent.
*
* We don't promise to preserve the bases if the selectors are
* nonzero. We also don't promise to preserve the base if the
* selector is zero and the base doesn't match whatever was
* most recently passed to ARCH_SET_FS/GS. (If/when the
* FSGSBASE instructions are enabled, we'll need to offer
* stronger guarantees.)
*
* As an invariant,
* (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
* impossible.
*/
if (next->fsindex) {
/* Loading a nonzero value into FS sets the index and base. */
loadsegment(fs, next->fsindex);
} else {
if (next->fsbase) {
/* Next index is zero but next base is nonzero. */
if (prev_fsindex)
loadsegment(fs, 0);
wrmsrl(MSR_FS_BASE, next->fsbase);
} else {
/* Next base and index are both zero. */
if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
/*
* We don't know the previous base and can't
* find out without RDMSR. Forcibly clear it.
*/
loadsegment(fs, __USER_DS);
loadsegment(fs, 0);
} else {
/*
* If the previous index is zero and ARCH_SET_FS
* didn't change the base, then the base is
* also zero and we don't need to do anything.
*/
if (prev->fsbase || prev_fsindex)
loadsegment(fs, 0);
}
}
}
/*
* Save the old state and preserve the invariant.
* NB: if prev_fsindex == 0, then we can't reliably learn the base
* without RDMSR because Intel user code can zero it without telling
* us and AMD user code can program any 32-bit value without telling
* us.
*/
if (prev_fsindex)
prev->fsbase = 0;
prev->fsindex = prev_fsindex;
if (next->gsindex) {
/* Loading a nonzero value into GS sets the index and base. */
load_gs_index(next->gsindex);
} else {
if (next->gsbase) {
/* Next index is zero but next base is nonzero. */
if (prev_gsindex)
load_gs_index(0);
wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
} else {
/* Next base and index are both zero. */
if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
/*
* We don't know the previous base and can't
* find out without RDMSR. Forcibly clear it.
*
* This contains a pointless SWAPGS pair.
* Fixing it would involve an explicit check
* for Xen or a new pvop.
*/
load_gs_index(__USER_DS);
load_gs_index(0);
} else {
/*
* If the previous index is zero and ARCH_SET_GS
* didn't change the base, then the base is
* also zero and we don't need to do anything.
*/
if (prev->gsbase || prev_gsindex)
load_gs_index(0);
}
}
}
/*
* Save the old state and preserve the invariant.
* NB: if prev_gsindex == 0, then we can't reliably learn the base
* without RDMSR because Intel user code can zero it without telling
* us and AMD user code can program any 32-bit value without telling
* us.
*/
if (prev_gsindex)
prev->gsbase = 0;
prev->gsindex = prev_gsindex;
load_seg_legacy(prev->fsindex, prev->fsbase,
next->fsindex, next->fsbase, FS);
load_seg_legacy(prev->gsindex, prev->gsbase,
next->gsindex, next->gsbase, GS);
switch_fpu_finish(next_fpu, fpu_switch);

View File

@@ -5844,6 +5844,8 @@ static void raid5_do_work(struct work_struct *work)
spin_unlock_irq(&conf->device_lock);
r5l_flush_stripe_to_raid(conf->log);
async_tx_issue_pending_all();
blk_finish_plug(&plug);

View File

@@ -317,12 +317,12 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
if (v != MBOX_OWNER_DRV) {
ret = (v == MBOX_OWNER_FW) ? -EBUSY : -ETIMEDOUT;
t4_record_mbox(adap, cmd, MBOX_LEN, access, ret);
t4_record_mbox(adap, cmd, size, access, ret);
return ret;
}
/* Copy in the new mailbox command and send it on its way ... */
t4_record_mbox(adap, cmd, MBOX_LEN, access, 0);
t4_record_mbox(adap, cmd, size, access, 0);
for (i = 0; i < size; i += 8)
t4_write_reg64(adap, data_reg + i, be64_to_cpu(*p++));
@@ -371,7 +371,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
}
ret = (pcie_fw & PCIE_FW_ERR_F) ? -ENXIO : -ETIMEDOUT;
t4_record_mbox(adap, cmd, MBOX_LEN, access, ret);
t4_record_mbox(adap, cmd, size, access, ret);
dev_err(adap->pdev_dev, "command %#x in mailbox %d timed out\n",
*(const u8 *)cmd, mbox);
t4_report_fw_error(adap);

View File

@@ -622,6 +622,9 @@ static struct platform_device *dpaa_eth_add_device(int fman_id,
goto no_mem;
}
pdev->dev.of_node = node;
pdev->dev.parent = priv->dev;
ret = platform_device_add_data(pdev, &data, sizeof(data));
if (ret)
goto err;

View File

@@ -3690,7 +3690,7 @@ static noinline void gfar_update_link_state(struct gfar_private *priv)
u32 tempval1 = gfar_read(&regs->maccfg1);
u32 tempval = gfar_read(&regs->maccfg2);
u32 ecntrl = gfar_read(&regs->ecntrl);
u32 tx_flow_oldval = (tempval & MACCFG1_TX_FLOW);
u32 tx_flow_oldval = (tempval1 & MACCFG1_TX_FLOW);
if (phydev->duplex != priv->oldduplex) {
if (!(phydev->duplex))

View File

@@ -4172,6 +4172,8 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *dev,
return -EINVAL;
if (!info->linking)
break;
if (netdev_has_any_upper_dev(upper_dev))
return -EINVAL;
/* HW limitation forbids to put ports to multiple bridges. */
if (netif_is_bridge_master(upper_dev) &&
!mlxsw_sp_master_bridge_check(mlxsw_sp, upper_dev))
@@ -4185,6 +4187,10 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *dev,
if (netif_is_lag_port(dev) && is_vlan_dev(upper_dev) &&
!netif_is_lag_master(vlan_dev_real_dev(upper_dev)))
return -EINVAL;
if (!info->linking)
break;
if (netdev_has_any_upper_dev(upper_dev))
return -EINVAL;
break;
case NETDEV_CHANGEUPPER:
upper_dev = info->upper_dev;

View File

@@ -724,7 +724,7 @@ static void ql_build_coredump_seg_header(
seg_hdr->cookie = MPI_COREDUMP_COOKIE;
seg_hdr->segNum = seg_number;
seg_hdr->segSize = seg_size;
memcpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1);
strncpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1);
}
/*

View File

@@ -1084,7 +1084,12 @@ static void netvsc_link_change(struct work_struct *w)
bool notify = false, reschedule = false;
unsigned long flags, next_reconfig, delay;
rtnl_lock();
/* if changes are happening, comeback later */
if (!rtnl_trylock()) {
schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
return;
}
if (ndev_ctx->start_remove)
goto out_unlock;

View File

@@ -3510,6 +3510,7 @@ module_init(macsec_init);
module_exit(macsec_exit);
MODULE_ALIAS_RTNL_LINK("macsec");
MODULE_ALIAS_GENL_FAMILY("macsec");
MODULE_DESCRIPTION("MACsec IEEE 802.1AE");
MODULE_LICENSE("GPL v2");

View File

@@ -674,9 +674,6 @@ void phy_stop_machine(struct phy_device *phydev)
if (phydev->state > PHY_UP && phydev->state != PHY_HALTED)
phydev->state = PHY_UP;
mutex_unlock(&phydev->lock);
/* Now we can run the state machine synchronously */
phy_state_machine(&phydev->state_queue.work);
}
/**

View File

@@ -537,8 +537,13 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
preempt_enable();
if (vhost_enable_notify(&net->dev, vq))
if (!vhost_vq_avail_empty(&net->dev, vq))
vhost_poll_queue(&vq->poll);
else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
vhost_disable_notify(&net->dev, vq);
vhost_poll_queue(&vq->poll);
}
mutex_unlock(&vq->mutex);
len = peek_head_len(sk);

View File

@@ -316,7 +316,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
return 0;
/* Get the previous summary */
for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
if (curseg->segno == segno) {
sum = curseg->sum_blk->entries[blkoff];
@@ -626,8 +626,6 @@ out:
}
clear_sbi_flag(sbi, SBI_POR_DOING);
if (err)
set_ckpt_flags(sbi, CP_ERROR_FLAG);
mutex_unlock(&sbi->cp_mutex);
/* let's drop all the directory inodes for clean checkpoint */

View File

@@ -637,6 +637,7 @@ again:
dispose_list(&dispose);
}
EXPORT_SYMBOL_GPL(evict_inodes);
/**
* invalidate_inodes - attempt to free all inodes on a superblock

View File

@@ -138,7 +138,6 @@ extern bool atime_needs_update_rcu(const struct path *, struct inode *);
extern void inode_io_list_del(struct inode *inode);
extern long get_nr_dirty_inodes(void);
extern void evict_inodes(struct super_block *);
extern int invalidate_inodes(struct super_block *, bool);
/*

View File

@@ -281,7 +281,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
unsigned long bytes; /* Bytes to write to page */
offset = (pos & (PAGE_SIZE - 1));
bytes = min_t(unsigned long, PAGE_SIZE - offset, length);
bytes = min_t(loff_t, PAGE_SIZE - offset, length);
rpage = __iomap_read_page(inode, pos);
if (IS_ERR(rpage))
@@ -376,7 +376,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
unsigned offset, bytes;
offset = pos & (PAGE_SIZE - 1); /* Within page */
bytes = min_t(unsigned, PAGE_SIZE - offset, count);
bytes = min_t(loff_t, PAGE_SIZE - offset, count);
if (IS_DAX(inode))
status = iomap_dax_zero(pos, offset, bytes, iomap);

View File

@@ -351,7 +351,7 @@ xfs_attr3_leaf_read(
err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
if (!err && tp)
if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
return err;
}

View File

@@ -579,7 +579,7 @@ xfs_bmap_validate_ret(
#else
#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0)
#endif /* DEBUG */
/*
@@ -5555,6 +5555,8 @@ __xfs_bunmapi(
int whichfork; /* data or attribute fork */
xfs_fsblock_t sum;
xfs_filblks_t len = *rlen; /* length to unmap in file */
xfs_fileoff_t max_len;
xfs_agnumber_t prev_agno = NULLAGNUMBER, agno;
trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
@@ -5576,6 +5578,16 @@ __xfs_bunmapi(
ASSERT(len > 0);
ASSERT(nexts >= 0);
/*
* Guesstimate how many blocks we can unmap without running the risk of
* blowing out the transaction with a mix of EFIs and reflink
* adjustments.
*/
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
else
max_len = len;
if (!(ifp->if_flags & XFS_IFEXTENTS) &&
(error = xfs_iread_extents(tp, ip, whichfork)))
return error;
@@ -5621,7 +5633,7 @@ __xfs_bunmapi(
extno = 0;
while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
(nexts == 0 || extno < nexts)) {
(nexts == 0 || extno < nexts) && max_len > 0) {
/*
* Is the found extent after a hole in which bno lives?
* Just back up to the previous extent, if so.
@@ -5647,6 +5659,17 @@ __xfs_bunmapi(
ASSERT(ep != NULL);
del = got;
wasdel = isnullstartblock(del.br_startblock);
/*
* Make sure we don't touch multiple AGF headers out of order
* in a single transaction, as that could cause AB-BA deadlocks.
*/
if (!wasdel) {
agno = XFS_FSB_TO_AGNO(mp, del.br_startblock);
if (prev_agno != NULLAGNUMBER && prev_agno > agno)
break;
prev_agno = agno;
}
if (got.br_startoff < start) {
del.br_startoff = start;
del.br_blockcount -= start - got.br_startoff;
@@ -5655,6 +5678,15 @@ __xfs_bunmapi(
}
if (del.br_startoff + del.br_blockcount > bno + 1)
del.br_blockcount = bno + 1 - del.br_startoff;
/* How much can we safely unmap? */
if (max_len < del.br_blockcount) {
del.br_startoff += del.br_blockcount - max_len;
if (!wasdel)
del.br_startblock += del.br_blockcount - max_len;
del.br_blockcount = max_len;
}
sum = del.br_startblock + del.br_blockcount;
if (isrt &&
(mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
@@ -5835,6 +5867,7 @@ __xfs_bunmapi(
if (!isrt && wasdel)
xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);
max_len -= del.br_blockcount;
bno = del.br_startoff - 1;
nodelete:
/*
@@ -6604,25 +6637,33 @@ xfs_bmap_finish_one(
int whichfork,
xfs_fileoff_t startoff,
xfs_fsblock_t startblock,
xfs_filblks_t blockcount,
xfs_filblks_t *blockcount,
xfs_exntst_t state)
{
struct xfs_bmbt_irec bmap;
int nimaps = 1;
xfs_fsblock_t firstfsb;
int flags = XFS_BMAPI_REMAP;
int done;
int error = 0;
bmap.br_startblock = startblock;
bmap.br_startoff = startoff;
bmap.br_blockcount = blockcount;
bmap.br_blockcount = *blockcount;
bmap.br_state = state;
/*
* firstfsb is tied to the transaction lifetime and is used to
* ensure correct AG locking order and schedule work item
* continuations. XFS_BUI_MAX_FAST_EXTENTS (== 1) restricts us
* to only making one bmap call per transaction, so it should
* be safe to have it as a local variable here.
*/
firstfsb = NULLFSBLOCK;
trace_xfs_bmap_deferred(tp->t_mountp,
XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
ip->i_ino, whichfork, startoff, blockcount, state);
ip->i_ino, whichfork, startoff, *blockcount, state);
if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK)
return -EFSCORRUPTED;
@@ -6641,12 +6682,11 @@ xfs_bmap_finish_one(
bmap.br_blockcount, flags, &firstfsb,
bmap.br_blockcount, &bmap, &nimaps,
dfops);
*blockcount = 0;
break;
case XFS_BMAP_UNMAP:
error = xfs_bunmapi(tp, ip, bmap.br_startoff,
bmap.br_blockcount, flags, 1, &firstfsb,
dfops, &done);
ASSERT(done);
error = __xfs_bunmapi(tp, ip, startoff, blockcount,
XFS_BMAPI_REMAP, 1, &firstfsb, dfops);
break;
default:
ASSERT(0);

View File

@@ -265,7 +265,7 @@ struct xfs_bmap_intent {
int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops,
struct xfs_inode *ip, enum xfs_bmap_intent_type type,
int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
xfs_filblks_t blockcount, xfs_exntst_t state);
xfs_filblks_t *blockcount, xfs_exntst_t state);
int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,

View File

@@ -888,6 +888,7 @@ xfs_bmbt_change_owner(
cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
if (!cur)
return -ENOMEM;
cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER;
error = xfs_btree_change_owner(cur, new_owner, buffer_list);
xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);

View File

@@ -714,7 +714,8 @@ xfs_btree_firstrec(
* Get the block pointer for this level.
*/
block = xfs_btree_get_block(cur, level, &bp);
xfs_btree_check_block(cur, block, level, bp);
if (xfs_btree_check_block(cur, block, level, bp))
return 0;
/*
* It's empty, there is no such record.
*/
@@ -743,7 +744,8 @@ xfs_btree_lastrec(
* Get the block pointer for this level.
*/
block = xfs_btree_get_block(cur, level, &bp);
xfs_btree_check_block(cur, block, level, bp);
if (xfs_btree_check_block(cur, block, level, bp))
return 0;
/*
* It's empty, there is no such record.
*/
@@ -1772,6 +1774,7 @@ xfs_btree_lookup_get_block(
/* Check the inode owner since the verifiers don't. */
if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
!(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
(cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
cur->bc_private.b.ip->i_ino)
@@ -4432,10 +4435,15 @@ xfs_btree_block_change_owner(
/* modify the owner */
block = xfs_btree_get_block(cur, level, &bp);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
return 0;
block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
else
} else {
if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
return 0;
block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
}
/*
* If the block is a root block hosted in an inode, we might not have a
@@ -4444,16 +4452,19 @@ xfs_btree_block_change_owner(
* block is formatted into the on-disk inode fork. We still change it,
* though, so everything is consistent in memory.
*/
if (bp) {
if (cur->bc_tp) {
xfs_trans_ordered_buf(cur->bc_tp, bp);
xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
} else {
xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
}
} else {
if (!bp) {
ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
ASSERT(level == cur->bc_nlevels - 1);
return 0;
}
if (cur->bc_tp) {
if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
return -EAGAIN;
}
} else {
xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
}
return 0;

View File

@@ -268,7 +268,8 @@ typedef struct xfs_btree_cur
short forksize; /* fork's inode space */
char whichfork; /* data or attr fork */
char flags; /* flags */
#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */
#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */
} b;
} bc_private; /* per-btree type data */
} xfs_btree_cur_t;

View File

@@ -263,7 +263,7 @@ xfs_da3_node_read(
err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
which_fork, &xfs_da3_node_buf_ops);
if (!err && tp) {
if (!err && tp && *bpp) {
struct xfs_da_blkinfo *info = (*bpp)->b_addr;
int type;

View File

@@ -139,7 +139,7 @@ xfs_dir3_block_read(
err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
if (!err && tp)
if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
return err;
}

View File

@@ -268,7 +268,7 @@ xfs_dir3_leaf_read(
err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
if (!err && tp)
if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
return err;
}
@@ -285,7 +285,7 @@ xfs_dir3_leafn_read(
err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
if (!err && tp)
if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
return err;
}

View File

@@ -368,8 +368,6 @@ xfs_ialloc_inode_init(
* transaction and pin the log appropriately.
*/
xfs_trans_ordered_buf(tp, fbuf);
xfs_trans_log_buf(tp, fbuf, 0,
BBTOB(fbuf->b_length) - 1);
}
} else {
fbuf->b_flags |= XBF_DONE;
@@ -1123,6 +1121,7 @@ xfs_dialloc_ag_inobt(
int error;
int offset;
int i, j;
int searchdistance = 10;
pag = xfs_perag_get(mp, agno);
@@ -1149,7 +1148,6 @@ xfs_dialloc_ag_inobt(
if (pagno == agno) {
int doneleft; /* done, to the left */
int doneright; /* done, to the right */
int searchdistance = 10;
error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
if (error)
@@ -1210,21 +1208,9 @@ xfs_dialloc_ag_inobt(
/*
* Loop until we find an inode chunk with a free inode.
*/
while (!doneleft || !doneright) {
while (--searchdistance > 0 && (!doneleft || !doneright)) {
int useleft; /* using left inode chunk this time */
if (!--searchdistance) {
/*
* Not in range - save last search
* location and allocate a new inode
*/
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
pag->pagl_leftrec = trec.ir_startino;
pag->pagl_rightrec = rec.ir_startino;
pag->pagl_pagino = pagino;
goto newino;
}
/* figure out the closer block if both are valid. */
if (!doneleft && !doneright) {
useleft = pagino -
@@ -1236,13 +1222,13 @@ xfs_dialloc_ag_inobt(
/* free inodes to the left? */
if (useleft && trec.ir_freecount) {
rec = trec;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
cur = tcur;
pag->pagl_leftrec = trec.ir_startino;
pag->pagl_rightrec = rec.ir_startino;
pag->pagl_pagino = pagino;
rec = trec;
goto alloc_inode;
}
@@ -1268,6 +1254,17 @@ xfs_dialloc_ag_inobt(
goto error1;
}
if (searchdistance <= 0) {
/*
* Not in range - save last search
* location and allocate a new inode
*/
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
pag->pagl_leftrec = trec.ir_startino;
pag->pagl_rightrec = rec.ir_startino;
pag->pagl_pagino = pagino;
} else {
/*
* We've reached the end of the btree. because
* we are only searching a small chunk of the
@@ -1282,12 +1279,12 @@ xfs_dialloc_ag_inobt(
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
goto restart_pagno;
}
}
/*
* In a different AG from the parent.
* See if the most recently allocated block has any free.
*/
newino:
if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
XFS_LOOKUP_EQ, &i);

View File

@@ -1539,14 +1539,11 @@ xfs_iext_realloc_indirect(
xfs_ifork_t *ifp, /* inode fork pointer */
int new_size) /* new indirection array size */
{
int nlists; /* number of irec's (ex lists) */
int size; /* current indirection array size */
ASSERT(ifp->if_flags & XFS_IFEXTIREC);
nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
size = nlists * sizeof(xfs_ext_irec_t);
ASSERT(ifp->if_real_bytes);
ASSERT((new_size >= 0) && (new_size != size));
ASSERT((new_size >= 0) &&
(new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) *
sizeof(xfs_ext_irec_t))));
if (new_size == 0) {
xfs_iext_destroy(ifp);
} else {

View File

@@ -784,14 +784,6 @@ xfs_refcount_merge_extents(
}
/*
* While we're adjusting the refcounts records of an extent, we have
* to keep an eye on the number of extents we're dirtying -- run too
* many in a single transaction and we'll exceed the transaction's
* reservation and crash the fs. Each record adds 12 bytes to the
* log (plus any key updates) so we'll conservatively assume 24 bytes
* per record. We must also leave space for btree splits on both ends
* of the range and space for the CUD and a new CUI.
*
* XXX: This is a pretty hand-wavy estimate. The penalty for guessing
* true incorrectly is a shutdown FS; the penalty for guessing false
* incorrectly is more transaction rolls than might be necessary.
@@ -822,7 +814,7 @@ xfs_refcount_still_have_space(
else if (overhead > cur->bc_tp->t_log_res)
return false;
return cur->bc_tp->t_log_res - overhead >
cur->bc_private.a.priv.refc.nr_ops * 32;
cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
}
/*
@@ -1648,6 +1640,10 @@ xfs_refcount_recover_cow_leftovers(
error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
if (error)
goto out_trans;
if (!agbp) {
error = -ENOMEM;
goto out_trans;
}
cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL);
/* Find all the leftover CoW staging extents. */

View File

@@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp,
extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
xfs_agnumber_t agno);
/*
* While we're adjusting the refcounts records of an extent, we have
* to keep an eye on the number of extents we're dirtying -- run too
* many in a single transaction and we'll exceed the transaction's
* reservation and crash the fs. Each record adds 12 bytes to the
* log (plus any key updates) so we'll conservatively assume 32 bytes
* per record. We must also leave space for btree splits on both ends
* of the range and space for the CUD and a new CUI.
*/
#define XFS_REFCOUNT_ITEM_OVERHEAD 32
/*
 * Turn a transaction log reservation into a count of refcount items:
 * three quarters of @log_res, divided by the per-item overhead
 * XFS_REFCOUNT_ITEM_OVERHEAD.
 *
 * NOTE(review): judging by the name, callers presumably use the result as
 * the largest extent to unmap in one transaction -- confirm at the call site.
 */
static inline xfs_fileoff_t
xfs_refcount_max_unmap(
	int		log_res)
{
	int		budget = log_res * 3 / 4;

	return budget / XFS_REFCOUNT_ITEM_OVERHEAD;
}
#endif /* __XFS_REFCOUNT_H__ */

View File

@@ -90,11 +90,11 @@ xfs_find_bdev_for_inode(
* associated buffer_heads, paying attention to the start and end offsets that
* we need to process on the page.
*
* Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
* buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
* the page at all, as we may be racing with memory reclaim and it can free both
* the bufferhead chain and the page as it will see the page as clean and
* unused.
* Note that we open code the action in end_buffer_async_write here so that we
* only have to iterate over the buffers attached to the page once. This is not
* only more efficient, but also ensures that we only call end_page_writeback
* at the end of the iteration, and thus avoids the pitfall of having the page
* and buffers potentially freed after every call to end_buffer_async_write.
*/
static void
xfs_finish_page_writeback(
@@ -102,29 +102,45 @@ xfs_finish_page_writeback(
struct bio_vec *bvec,
int error)
{
unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
struct buffer_head *head, *bh, *next;
struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head;
bool busy = false;
unsigned int off = 0;
unsigned int bsize;
unsigned long flags;
ASSERT(bvec->bv_offset < PAGE_SIZE);
ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
ASSERT(end < PAGE_SIZE);
ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
bh = head = page_buffers(bvec->bv_page);
bsize = bh->b_size;
local_irq_save(flags);
bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
do {
if (off > end)
break;
next = bh->b_this_page;
if (off < bvec->bv_offset)
goto next_bh;
bh->b_end_io(bh, !error);
next_bh:
off += bsize;
} while ((bh = next) != head);
if (off >= bvec->bv_offset &&
off < bvec->bv_offset + bvec->bv_len) {
ASSERT(buffer_async_write(bh));
ASSERT(bh->b_end_io == NULL);
if (error) {
mapping_set_error(bvec->bv_page->mapping, -EIO);
set_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
SetPageError(bvec->bv_page);
} else {
set_buffer_uptodate(bh);
}
clear_buffer_async_write(bh);
unlock_buffer(bh);
} else if (buffer_async_write(bh)) {
ASSERT(buffer_locked(bh));
busy = true;
}
off += bh->b_size;
} while ((bh = bh->b_this_page) != head);
bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
local_irq_restore(flags);
if (!busy)
end_page_writeback(bvec->bv_page);
}
/*
@@ -138,8 +154,10 @@ xfs_destroy_ioend(
int error)
{
struct inode *inode = ioend->io_inode;
struct bio *last = ioend->io_bio;
struct bio *bio, *next;
struct bio *bio = &ioend->io_inline_bio;
struct bio *last = ioend->io_bio, *next;
u64 start = bio->bi_iter.bi_sector;
bool quiet = bio_flagged(bio, BIO_QUIET);
for (bio = &ioend->io_inline_bio; bio; bio = next) {
struct bio_vec *bvec;
@@ -160,6 +178,11 @@ xfs_destroy_ioend(
bio_put(bio);
}
if (unlikely(error && !quiet)) {
xfs_err_ratelimited(XFS_I(inode)->i_mount,
"writeback error on sector %llu", start);
}
}
/*
@@ -427,7 +450,8 @@ xfs_start_buffer_writeback(
ASSERT(!buffer_delay(bh));
ASSERT(!buffer_unwritten(bh));
mark_buffer_async_write(bh);
bh->b_end_io = NULL;
set_buffer_async_write(bh);
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
}
@@ -1566,9 +1590,12 @@ xfs_vm_bmap(
* The swap code (ab-)uses ->bmap to get a block mapping and then
* bypasses the file system for actual I/O. We really can't allow
* that on reflinks inodes, so we have to skip out here. And yes,
* 0 is the magic code for a bmap error..
* 0 is the magic code for a bmap error.
*
* Since we don't pass back blockdev info, we can't return bmap
* information for rt files either.
*/
if (xfs_is_reflink_inode(ip)) {
if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) {
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return 0;
}

View File

@@ -395,6 +395,7 @@ xfs_bui_recover(
struct xfs_map_extent *bmap;
xfs_fsblock_t startblock_fsb;
xfs_fsblock_t inode_fsb;
xfs_filblks_t count;
bool op_ok;
struct xfs_bud_log_item *budp;
enum xfs_bmap_intent_type type;
@@ -403,6 +404,7 @@ xfs_bui_recover(
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
struct xfs_defer_ops dfops;
struct xfs_bmbt_irec irec;
xfs_fsblock_t firstfsb;
ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
@@ -480,13 +482,24 @@ xfs_bui_recover(
}
xfs_trans_ijoin(tp, ip, 0);
count = bmap->me_len;
error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
ip, whichfork, bmap->me_startoff,
bmap->me_startblock, bmap->me_len,
state);
bmap->me_startblock, &count, state);
if (error)
goto err_dfops;
if (count > 0) {
ASSERT(type == XFS_BMAP_UNMAP);
irec.br_startblock = bmap->me_startblock;
irec.br_blockcount = count;
irec.br_startoff = bmap->me_startoff;
irec.br_state = state;
error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec);
if (error)
goto err_dfops;
}
/* Finish transaction, free inodes. */
error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)

View File

@@ -1825,29 +1825,18 @@ xfs_swap_extent_forks(
}
/*
* Before we've swapped the forks, lets set the owners of the forks
* appropriately. We have to do this as we are demand paging the btree
* buffers, and so the validation done on read will expect the owner
* field to be correctly set. Once we change the owners, we can swap the
* inode forks.
* Btree format (v3) inodes have the inode number stamped in the bmbt
* block headers. We can't start changing the bmbt blocks until the
* inode owner change is logged so recovery does the right thing in the
* event of a crash. Set the owner change log flags now and leave the
* bmbt scan as the last step.
*/
if (ip->i_d.di_version == 3 &&
ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
(*target_log_flags) |= XFS_ILOG_DOWNER;
error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
tip->i_ino, NULL);
if (error)
return error;
}
if (tip->i_d.di_version == 3 &&
tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
(*src_log_flags) |= XFS_ILOG_DOWNER;
error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
ip->i_ino, NULL);
if (error)
return error;
}
/*
* Swap the data forks of the inodes
@@ -1925,6 +1914,48 @@ xfs_swap_extent_forks(
return 0;
}
/*
 * Rewrite the owner of every bmbt block in @ip's data fork so that it names
 * @ip itself.
 *
 * The change owner scan attempts to order all the buffers it modifies in the
 * current transaction.  When a buffer cannot be ordered it is physically
 * logged as a fallback and the scan returns -EAGAIN.  In that case the
 * transaction is rolled to replenish the fallback log reservation and the
 * scan is restarted; this repeats until the scan completes or fails with any
 * other error.
 *
 * @tpp is updated by each roll.  Returns 0 on success or a negative errno
 * from the scan or from the transaction roll.
 */
static int
xfs_swap_change_owner(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	struct xfs_inode	*tmpip)
{
	struct xfs_trans	*tp = *tpp;
	int			error;

	for (;;) {
		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
					      NULL);
		/* Anything but -EAGAIN is final: success or a fatal error. */
		if (error != -EAGAIN)
			return error;

		error = xfs_trans_roll(tpp, NULL);
		if (error)
			return error;
		tp = *tpp;

		/*
		 * Join and relog both inodes in the new transaction so they
		 * keep the log tail moving forward.
		 */
		xfs_trans_ijoin(tp, ip, 0);
		xfs_trans_ijoin(tp, tmpip, 0);
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
	}
}
int
xfs_swap_extents(
struct xfs_inode *ip, /* target inode */
@@ -1938,8 +1969,8 @@ xfs_swap_extents(
int error = 0;
int lock_flags;
struct xfs_ifork *cowfp;
__uint64_t f;
int resblks;
uint64_t f;
int resblks = 0;
/*
* Lock the inodes against other IO, page faults and truncate to
@@ -1987,11 +2018,8 @@ xfs_swap_extents(
XFS_SWAP_RMAP_SPACE_RES(mp,
XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
XFS_DATA_FORK);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
0, 0, &tp);
} else
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0,
0, 0, &tp);
}
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
if (error)
goto out_unlock;
@@ -2076,6 +2104,23 @@ xfs_swap_extents(
xfs_trans_log_inode(tp, ip, src_log_flags);
xfs_trans_log_inode(tp, tip, target_log_flags);
/*
* The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
* have inode number owner values in the bmbt blocks that still refer to
* the old inode. Scan each bmbt to fix up the owner values with the
* inode number of the current inode.
*/
if (src_log_flags & XFS_ILOG_DOWNER) {
error = xfs_swap_change_owner(&tp, ip, tip);
if (error)
goto out_trans_cancel;
}
if (target_log_flags & XFS_ILOG_DOWNER) {
error = xfs_swap_change_owner(&tp, tip, ip);
if (error)
goto out_trans_cancel;
}
/*
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.

View File

@@ -116,7 +116,7 @@ static inline void
__xfs_buf_ioacct_dec(
struct xfs_buf *bp)
{
ASSERT(spin_is_locked(&bp->b_lock));
lockdep_assert_held(&bp->b_lock);
if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
@@ -2022,6 +2022,66 @@ xfs_buf_delwri_submit(
return error;
}
/*
* Push a single buffer on a delwri queue.
*
* The purpose of this function is to submit a single buffer of a delwri queue
* and return with the buffer still on the original queue. The waiting delwri
* buffer submission infrastructure guarantees transfer of the delwri queue
* buffer reference to a temporary wait list. We reuse this infrastructure to
* transfer the buffer back to the original queue.
*
* Note the buffer transitions from the queued state, to the submitted and wait
* listed state and back to the queued state during this call. The buffer
* locking and queue management logic between _delwri_pushbuf() and
* _delwri_queue() guarantee that the buffer cannot be queued to another list
* before returning.
*/
int
xfs_buf_delwri_pushbuf(
struct xfs_buf *bp,
struct list_head *buffer_list)
{
LIST_HEAD (submit_list);
int error;
ASSERT(bp->b_flags & _XBF_DELWRI_Q);
trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
/*
* Isolate the buffer to a new local list so we can submit it for I/O
* independently from the rest of the original list.
*/
xfs_buf_lock(bp);
list_move(&bp->b_list, &submit_list);
xfs_buf_unlock(bp);
/*
* Delwri submission clears the DELWRI_Q buffer flag and returns with
* the buffer on the wait list with an associated reference. Rather than
* bounce the buffer from a local wait list back to the original list
* after I/O completion, reuse the original list as the wait list.
*/
xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
/*
* The buffer is now under I/O and wait listed as during typical delwri
* submission. Lock the buffer to wait for I/O completion. Rather than
* remove the buffer from the wait list and release the reference, we
* want to return with the buffer queued to the original list. The
* buffer already sits on the original list with a wait list reference,
* however. If we let the queue inherit that wait list reference, all we
* need to do is reset the DELWRI_Q flag.
*/
xfs_buf_lock(bp);
error = bp->b_error;
bp->b_flags |= _XBF_DELWRI_Q;
xfs_buf_unlock(bp);
return error;
}
int __init
xfs_buf_init(void)
{

View File

@@ -333,6 +333,7 @@ extern void xfs_buf_delwri_cancel(struct list_head *);
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
/* Buffer Daemon Setup Routines */
extern int xfs_buf_init(void);

View File

@@ -29,6 +29,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_inode.h"
kmem_zone_t *xfs_buf_item_zone;
@@ -322,6 +323,8 @@ xfs_buf_item_format(
ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
(xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
&& xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
(bip->bli_flags & XFS_BLI_STALE));
/*
@@ -346,16 +349,6 @@ xfs_buf_item_format(
bip->bli_flags &= ~XFS_BLI_INODE_BUF;
}
if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
XFS_BLI_ORDERED) {
/*
* The buffer has been logged just to order it. It is not being
* included in the transaction commit, so don't format it.
*/
trace_xfs_buf_item_format_ordered(bip);
return;
}
for (i = 0; i < bip->bli_format_count; i++) {
xfs_buf_item_format_segment(bip, lv, &vecp, offset,
&bip->bli_formats[i]);
@@ -574,26 +567,20 @@ xfs_buf_item_unlock(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
bool clean;
bool aborted;
int flags;
bool aborted = !!(lip->li_flags & XFS_LI_ABORTED);
bool hold = !!(bip->bli_flags & XFS_BLI_HOLD);
bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY);
#if defined(DEBUG) || defined(XFS_WARN)
bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED);
#endif
/* Clear the buffer's association with this transaction. */
bp->b_transp = NULL;
/*
* If this is a transaction abort, don't return early. Instead, allow
* the brelse to happen. Normally it would be done for stale
* (cancelled) buffers at unpin time, but we'll never go through the
* pin/unpin cycle if we abort inside commit.
* The per-transaction state has been copied above so clear it from the
* bli.
*/
aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
/*
* Before possibly freeing the buf item, copy the per-transaction state
* so we can reference it safely later after clearing it from the
* buffer log item.
*/
flags = bip->bli_flags;
bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
/*
@@ -601,7 +588,7 @@ xfs_buf_item_unlock(
* unlock the buffer and free the buf item when the buffer is unpinned
* for the last time.
*/
if (flags & XFS_BLI_STALE) {
if (bip->bli_flags & XFS_BLI_STALE) {
trace_xfs_buf_item_unlock_stale(bip);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
if (!aborted) {
@@ -619,40 +606,34 @@ xfs_buf_item_unlock(
* regardless of whether it is dirty or not. A dirty abort implies a
* shutdown, anyway.
*
* Ordered buffers are dirty but may have no recorded changes, so ensure
* we only release clean items here.
* The bli dirty state should match whether the blf has logged segments
* except for ordered buffers, where only the bli should be dirty.
*/
clean = (flags & XFS_BLI_DIRTY) ? false : true;
if (clean) {
int i;
for (i = 0; i < bip->bli_format_count; i++) {
if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
bip->bli_formats[i].blf_map_size)) {
clean = false;
break;
}
}
}
ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
(ordered && dirty && !xfs_buf_item_dirty_format(bip)));
/*
* Clean buffers, by definition, cannot be in the AIL. However, aborted
* buffers may be dirty and hence in the AIL. Therefore if we are
* aborting a buffer and we've just taken the last refernce away, we
* have to check if it is in the AIL before freeing it. We need to free
* it in this case, because an aborted transaction has already shut the
* filesystem down and this is the last chance we will have to do so.
* buffers may be in the AIL regardless of dirty state. An aborted
* transaction that invalidates a buffer already in the AIL may have
* marked it stale and cleared the dirty state, for example.
*
* Therefore if we are aborting a buffer and we've just taken the last
* reference away, we have to check if it is in the AIL before freeing
* it. We need to free it in this case, because an aborted transaction
* has already shut the filesystem down and this is the last chance we
* will have to do so.
*/
if (atomic_dec_and_test(&bip->bli_refcount)) {
if (clean)
xfs_buf_item_relse(bp);
else if (aborted) {
if (aborted) {
ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
}
} else if (!dirty)
xfs_buf_item_relse(bp);
}
if (!(flags & XFS_BLI_HOLD))
if (!hold)
xfs_buf_relse(bp);
}
@@ -942,14 +923,22 @@ xfs_buf_item_log(
/*
* Return 1 if the buffer has been logged or ordered in a transaction (at any
* point, not just the current transaction) and 0 if not.
* Return true if the buffer has any ranges logged/dirtied by a transaction,
* false otherwise.
*/
uint
xfs_buf_item_dirty(
xfs_buf_log_item_t *bip)
bool
xfs_buf_item_dirty_format(
struct xfs_buf_log_item *bip)
{
return (bip->bli_flags & XFS_BLI_DIRTY);
int i;
for (i = 0; i < bip->bli_format_count; i++) {
if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
bip->bli_formats[i].blf_map_size))
return true;
}
return false;
}
STATIC void
@@ -1051,6 +1040,31 @@ xfs_buf_do_callbacks(
}
}
/*
 * Run the error callback of every log item attached to a buffer whose write
 * failed.
 *
 * When a metadata buffer write fails with a non-permanent error the buffer is
 * eventually resubmitted, so the normal completion callbacks are not run.  The
 * error state may still have to reach the attached log items so that the next
 * AIL push of each item knows how to handle it correctly.
 *
 * The items are walked from bp->b_fspriv along li_bio_list with the AIL lock
 * (xa_lock) held; items whose ops have no iop_error hook are skipped.
 */
STATIC void
xfs_buf_do_callbacks_fail(
	struct xfs_buf		*bp)
{
	struct xfs_log_item	*lip = bp->b_fspriv;
	struct xfs_ail		*ailp = lip->li_ailp;

	spin_lock(&ailp->xa_lock);
	while (lip) {
		/* Fetch the successor before the callback runs. */
		struct xfs_log_item	*next = lip->li_bio_list;

		if (lip->li_ops->iop_error)
			lip->li_ops->iop_error(lip, bp);
		lip = next;
	}
	spin_unlock(&ailp->xa_lock);
}
static bool
xfs_buf_iodone_callback_error(
struct xfs_buf *bp)
@@ -1120,7 +1134,11 @@ xfs_buf_iodone_callback_error(
if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
goto permanent_error;
/* still a transient error, higher layers will retry */
/*
* Still a transient error, run IO completion failure callbacks and let
* the higher layers retry the buffer.
*/
xfs_buf_do_callbacks_fail(bp);
xfs_buf_ioerror(bp, 0);
xfs_buf_relse(bp);
return true;
@@ -1201,3 +1219,31 @@ xfs_buf_iodone(
xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
xfs_buf_item_free(BUF_ITEM(lip));
}
/*
 * Requeue a buffer whose writeback failed.
 *
 * XFS_LI_FAILED is cleared on every log item in the chain that starts at @lip
 * and is linked through li_bio_list, then @bp is added back to the delayed
 * write list @buffer_list.
 *
 * Setting and clearing XFS_LI_FAILED is protected by xa_lock; the caller of
 * this function is expected to already hold it.
 *
 * Returns true if the buffer has been re-queued properly, false otherwise.
 */
bool
xfs_buf_resubmit_failed_buffers(
	struct xfs_buf		*bp,
	struct xfs_log_item	*lip,
	struct list_head	*buffer_list)
{
	/* Drop the failed state from all items before resubmitting. */
	while (lip) {
		struct xfs_log_item	*next = lip->li_bio_list;

		xfs_clear_li_failed(lip);
		lip = next;
	}

	/* Put the buffer back on the delayed write list. */
	return xfs_buf_delwri_queue(bp, buffer_list);
}

View File

@@ -64,12 +64,15 @@ typedef struct xfs_buf_log_item {
int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void xfs_buf_item_relse(struct xfs_buf *);
void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
uint xfs_buf_item_dirty(xfs_buf_log_item_t *);
bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
void xfs_buf_attach_iodone(struct xfs_buf *,
void(*)(struct xfs_buf *, xfs_log_item_t *),
xfs_log_item_t *);
void xfs_buf_iodone_callbacks(struct xfs_buf *);
void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
struct xfs_log_item *,
struct list_head *);
extern kmem_zone_t *xfs_buf_item_zone;

View File

@@ -729,6 +729,7 @@ write_retry:
xfs_rw_iunlock(ip, iolock);
eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
xfs_icache_free_eofblocks(ip->i_mount, &eofb);
xfs_icache_free_cowblocks(ip->i_mount, &eofb);
goto write_retry;
}
@@ -1139,30 +1140,9 @@ xfs_find_get_desired_pgoff(
want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
want);
/*
* No page mapped into given range. If we are searching holes
* and if this is the first time we got into the loop, it means
* that the given offset is landed in a hole, return it.
*
* If we have already stepped through some block buffers to find
* holes but they all contains data. In this case, the last
* offset is already updated and pointed to the end of the last
* mapped page, if it does not reach the endpoint to search,
* that means there should be a hole between them.
*/
if (nr_pages == 0) {
/* Data search found nothing */
if (type == DATA_OFF)
if (nr_pages == 0)
break;
ASSERT(type == HOLE_OFF);
if (lastoff == startoff || lastoff < endoff) {
found = true;
*offset = lastoff;
}
break;
}
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
loff_t b_offset;
@@ -1227,21 +1207,20 @@ xfs_find_get_desired_pgoff(
/*
* The number of returned pages less than our desired, search
* done. In this case, nothing was found for searching data,
* but we found a hole behind the last offset.
* done.
*/
if (nr_pages < want) {
if (type == HOLE_OFF) {
*offset = lastoff;
found = true;
}
if (nr_pages < want)
break;
}
index = pvec.pages[i - 1]->index + 1;
pagevec_release(&pvec);
} while (index <= end);
/* No page at lastoff and we are not done - we found a hole. */
if (type == HOLE_OFF && lastoff < endoff) {
*offset = lastoff;
found = true;
}
out:
pagevec_release(&pvec);
return found;

View File

@@ -66,7 +66,6 @@ xfs_inode_alloc(
XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(!xfs_isiflocked(ip));
ASSERT(ip->i_ino == 0);
@@ -192,7 +191,7 @@ xfs_perag_set_reclaim_tag(
{
struct xfs_mount *mp = pag->pag_mount;
ASSERT(spin_is_locked(&pag->pag_ici_lock));
lockdep_assert_held(&pag->pag_ici_lock);
if (pag->pag_ici_reclaimable++)
return;
@@ -214,7 +213,7 @@ xfs_perag_clear_reclaim_tag(
{
struct xfs_mount *mp = pag->pag_mount;
ASSERT(spin_is_locked(&pag->pag_ici_lock));
lockdep_assert_held(&pag->pag_ici_lock);
if (--pag->pag_ici_reclaimable)
return;
@@ -1079,11 +1078,11 @@ reclaim:
* Because we use RCU freeing we need to ensure the inode always appears
* to be reclaimed with an invalid inode number when in the free state.
* We do this as early as possible under the ILOCK so that
* xfs_iflush_cluster() can be guaranteed to detect races with us here.
* By doing this, we guarantee that once xfs_iflush_cluster has locked
* XFS_ILOCK that it will see either a valid, flushable inode that will
* serialise correctly, or it will see a clean (and invalid) inode that
* it can skip.
* xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
* detect races with us here. By doing this, we guarantee that once
* xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
* it will see either a valid inode that will serialise correctly, or it
* will see an invalid inode that it can skip.
*/
spin_lock(&ip->i_flags_lock);
ip->i_flags = XFS_IRECLAIM;

View File

@@ -881,7 +881,6 @@ xfs_ialloc(
case S_IFREG:
case S_IFDIR:
if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
uint64_t di_flags2 = 0;
uint di_flags = 0;
if (S_ISDIR(mode)) {
@@ -918,20 +917,23 @@ xfs_ialloc(
di_flags |= XFS_DIFLAG_NODEFRAG;
if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
di_flags |= XFS_DIFLAG_FILESTREAM;
if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
di_flags2 |= XFS_DIFLAG2_DAX;
ip->i_d.di_flags |= di_flags;
ip->i_d.di_flags2 |= di_flags2;
}
if (pip &&
(pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
pip->i_d.di_version == 3 &&
ip->i_d.di_version == 3) {
uint64_t di_flags2 = 0;
if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
}
if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
di_flags2 |= XFS_DIFLAG2_DAX;
ip->i_d.di_flags2 |= di_flags2;
}
/* FALLTHROUGH */
case S_IFLNK:
@@ -2366,12 +2368,25 @@ retry:
* already marked stale. If we can't lock it, back off
* and retry.
*/
if (ip != free_ip &&
!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
if (ip != free_ip) {
if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
rcu_read_unlock();
delay(1);
goto retry;
}
/*
* Check the inode number again in case we're
* racing with freeing in xfs_reclaim_inode().
* See the comments in that function for more
* information as to why the initial check is
* not sufficient.
*/
if (ip->i_ino != inum + i) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
continue;
}
}
rcu_read_unlock();
xfs_iflock(ip);

View File

@@ -27,6 +27,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
@@ -475,6 +476,23 @@ xfs_inode_item_unpin(
wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
}
/*
* I/O error callback for inode log items; installed as ->iop_error in
* xfs_inode_item_ops.
*
* Marks the log item @lip as failed, via xfs_set_li_failed(), when the buffer
* @bp it is attached to has failed during writeback.
*
* This informs the AIL that the inode is already flush locked on the next push,
* and acquires a hold on the buffer to ensure that it isn't reclaimed before
* dirty data makes it to disk.
*
* NOTE(review): the buffer hold described above is presumably taken inside
* xfs_set_li_failed(); that helper is not visible here -- confirm.
*/
STATIC void
xfs_inode_item_error(
struct xfs_log_item *lip,
struct xfs_buf *bp)
{
/* The inode behind this item is expected to still hold its flush lock. */
ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode));
xfs_set_li_failed(lip, bp);
}
STATIC uint
xfs_inode_item_push(
struct xfs_log_item *lip,
@@ -484,13 +502,28 @@ xfs_inode_item_push(
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
struct xfs_buf *bp = NULL;
struct xfs_buf *bp = lip->li_buf;
uint rval = XFS_ITEM_SUCCESS;
int error;
if (xfs_ipincount(ip) > 0)
return XFS_ITEM_PINNED;
/*
* The buffer containing this item failed to be written back
* previously. Resubmit the buffer for IO.
*/
if (lip->li_flags & XFS_LI_FAILED) {
if (!xfs_buf_trylock(bp))
return XFS_ITEM_LOCKED;
if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_unlock(bp);
return rval;
}
if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
return XFS_ITEM_LOCKED;
@@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = {
.iop_unlock = xfs_inode_item_unlock,
.iop_committed = xfs_inode_item_committed,
.iop_push = xfs_inode_item_push,
.iop_committing = xfs_inode_item_committing
.iop_committing = xfs_inode_item_committing,
.iop_error = xfs_inode_item_error
};
@@ -710,7 +744,8 @@ xfs_iflush_done(
* the AIL lock.
*/
iip = INODE_ITEM(blip);
if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
lip->li_flags & XFS_LI_FAILED)
need_ail++;
blip = next;
@@ -718,7 +753,8 @@ xfs_iflush_done(
/* make sure we capture the state of the initial inode. */
iip = INODE_ITEM(lip);
if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
lip->li_flags & XFS_LI_FAILED)
need_ail++;
/*
@@ -731,22 +767,30 @@ xfs_iflush_done(
* holding the lock before removing the inode from the AIL.
*/
if (need_ail) {
struct xfs_log_item *log_items[need_ail];
int i = 0;
bool mlip_changed = false;
/* this is an opencoded batch version of xfs_trans_ail_delete */
spin_lock(&ailp->xa_lock);
for (blip = lip; blip; blip = blip->li_bio_list) {
iip = INODE_ITEM(blip);
if (iip->ili_logged &&
blip->li_lsn == iip->ili_flush_lsn) {
log_items[i++] = blip;
if (INODE_ITEM(blip)->ili_logged &&
blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
mlip_changed |= xfs_ail_delete_one(ailp, blip);
else {
xfs_clear_li_failed(blip);
}
ASSERT(i <= need_ail);
}
/* xfs_trans_ail_delete_bulk() drops the AIL lock. */
xfs_trans_ail_delete_bulk(ailp, log_items, i,
SHUTDOWN_CORRUPT_INCORE);
}
if (mlip_changed) {
if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
xlog_assign_tail_lsn_locked(ailp->xa_mount);
if (list_empty(&ailp->xa_ail))
wake_up_all(&ailp->xa_empty);
}
spin_unlock(&ailp->xa_lock);
if (mlip_changed)
xfs_log_space_wake(ailp->xa_mount);
}
/*
* clean up and unlock the flush lock now we are done. We can clear the

View File

@@ -928,16 +928,15 @@ xfs_ioc_fsgetxattr(
return 0;
}
STATIC void
xfs_set_diflags(
STATIC uint16_t
xfs_flags2diflags(
struct xfs_inode *ip,
unsigned int xflags)
{
unsigned int di_flags;
uint64_t di_flags2;
/* can't set PREALLOC this way, just preserve it */
di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
uint16_t di_flags =
(ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
if (xflags & FS_XFLAG_IMMUTABLE)
di_flags |= XFS_DIFLAG_IMMUTABLE;
if (xflags & FS_XFLAG_APPEND)
@@ -967,19 +966,24 @@ xfs_set_diflags(
if (xflags & FS_XFLAG_EXTSIZE)
di_flags |= XFS_DIFLAG_EXTSIZE;
}
ip->i_d.di_flags = di_flags;
/* diflags2 only valid for v3 inodes. */
if (ip->i_d.di_version < 3)
return;
return di_flags;
}
STATIC uint64_t
xfs_flags2diflags2(
struct xfs_inode *ip,
unsigned int xflags)
{
uint64_t di_flags2 =
(ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
if (xflags & FS_XFLAG_DAX)
di_flags2 |= XFS_DIFLAG2_DAX;
if (xflags & FS_XFLAG_COWEXTSIZE)
di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
ip->i_d.di_flags2 = di_flags2;
return di_flags2;
}
STATIC void
@@ -1005,11 +1009,12 @@ xfs_diflags_to_linux(
inode->i_flags |= S_NOATIME;
else
inode->i_flags &= ~S_NOATIME;
#if 0 /* disabled until the flag switching races are sorted out */
if (xflags & FS_XFLAG_DAX)
inode->i_flags |= S_DAX;
else
inode->i_flags &= ~S_DAX;
#endif
}
static int
@@ -1019,6 +1024,7 @@ xfs_ioctl_setattr_xflags(
struct fsxattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
uint64_t di_flags2;
/* Can't change realtime flag if any extents are allocated. */
if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
@@ -1049,7 +1055,14 @@ xfs_ioctl_setattr_xflags(
!capable(CAP_LINUX_IMMUTABLE))
return -EPERM;
xfs_set_diflags(ip, fa->fsx_xflags);
/* diflags2 only valid for v3 inodes. */
di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
if (di_flags2 && ip->i_d.di_version < 3)
return -EINVAL;
ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags);
ip->i_d.di_flags2 = di_flags2;
xfs_diflags_to_linux(ip);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

View File

@@ -802,7 +802,7 @@ xfs_vn_setattr_nonsize(
* Caution: The caller of this function is responsible for calling
* setattr_prepare() or otherwise verifying the change is fine.
*/
int
STATIC int
xfs_setattr_size(
struct xfs_inode *ip,
struct iattr *iattr)

View File

@@ -743,15 +743,45 @@ xfs_log_mount_finish(
struct xfs_mount *mp)
{
int error = 0;
bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
return 0;
} else if (readonly) {
/* Allow unlinked processing to proceed */
mp->m_flags &= ~XFS_MOUNT_RDONLY;
}
/*
* During the second phase of log recovery, we need iget and
* iput to behave like they do for an active filesystem.
* xfs_fs_drop_inode needs to be able to prevent the deletion
* of inodes before we're done replaying log items on those
* inodes. Turn it off immediately after recovery finishes
* so that we don't leak the quota inodes if subsequent mount
* activities fail.
*
* We let all inodes involved in redo item processing end up on
* the LRU instead of being evicted immediately so that if we do
* something to an unlinked inode, the irele won't cause
* premature truncation and freeing of the inode, which results
* in log recovery failure. We have to evict the unreferenced
* lru inodes after clearing MS_ACTIVE because we don't
* otherwise clean up the lru if there's a subsequent failure in
* xfs_mountfs, which leads to us leaking the inodes if nothing
* else (e.g. quotacheck) references the inodes before the
* mount failure occurs.
*/
mp->m_super->s_flags |= MS_ACTIVE;
error = xlog_recover_finish(mp->m_log);
if (!error)
xfs_log_work_queue(mp);
mp->m_super->s_flags &= ~MS_ACTIVE;
evict_inodes(mp->m_super);
if (readonly)
mp->m_flags |= XFS_MOUNT_RDONLY;
return error;
}
@@ -801,11 +831,14 @@ xfs_log_unmount_write(xfs_mount_t *mp)
int error;
/*
* Don't write out unmount record on read-only mounts.
* Don't write out unmount record on norecovery mounts or ro devices.
* Or, if we are doing a forced umount (typically because of IO errors).
*/
if (mp->m_flags & XFS_MOUNT_RDONLY)
if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
return 0;
}
error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
@@ -3304,8 +3337,6 @@ maybe_sleep:
*/
if (iclog->ic_state & XLOG_STATE_IOERROR)
return -EIO;
if (log_flushed)
*log_flushed = 1;
} else {
no_sleep:
@@ -3409,8 +3440,6 @@ try_again:
xlog_wait(&iclog->ic_prev->ic_write_wait,
&log->l_icloglock);
if (log_flushed)
*log_flushed = 1;
already_slept = 1;
goto try_again;
}
@@ -3444,9 +3473,6 @@ try_again:
*/
if (iclog->ic_state & XLOG_STATE_IOERROR)
return -EIO;
if (log_flushed)
*log_flushed = 1;
} else { /* just return */
spin_unlock(&log->l_icloglock);
}

View File

@@ -1029,61 +1029,106 @@ out_error:
}
/*
* Check the log tail for torn writes. This is required when torn writes are
* detected at the head and the head had to be walked back to a previous record.
* The tail of the previous record must now be verified to ensure the torn
* writes didn't corrupt the previous tail.
* Calculate distance from head to tail (i.e., unused space in the log).
*/
/*
 * Measure how far one has to walk forward from head_blk to reach tail_blk,
 * expressed in the same block units as log->l_logBBsize. When head_blk is
 * not below tail_blk the walk has to wrap past the end of the log, so the
 * remainder of the log after head_blk is added to tail_blk.
 */
static inline int
xlog_tail_distance(
	struct xlog	*log,
	xfs_daddr_t	head_blk,
	xfs_daddr_t	tail_blk)
{
	/* head at or beyond the tail: the span wraps around the log end */
	if (head_blk >= tail_blk)
		return tail_blk + (log->l_logBBsize - head_blk);

	/* head strictly below the tail: plain difference, no wrap */
	return tail_blk - head_blk;
}
/*
* Verify the log tail. This is particularly important when torn or incomplete
* writes have been detected near the front of the log and the head has been
* walked back accordingly.
*
* Return an error if CRC verification fails as recovery cannot proceed.
* We also have to handle the case where the tail was pinned and the head
* blocked behind the tail right before a crash. If the tail had been pushed
* immediately prior to the crash and the subsequent checkpoint was only
* partially written, it's possible it overwrote the last referenced tail in the
* log with garbage. This is not a coherency problem because the tail must have
* been pushed before it can be overwritten, but appears as log corruption to
* recovery because we have no way to know the tail was updated if the
* subsequent checkpoint didn't write successfully.
*
* Therefore, CRC check the log from tail to head. If a failure occurs and the
* offending record is within max iclog bufs from the head, walk the tail
* forward and retry until a valid tail is found or corruption is detected out
* of the range of a possible overwrite.
*/
STATIC int
xlog_verify_tail(
struct xlog *log,
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk)
xfs_daddr_t *tail_blk,
int hsize)
{
struct xlog_rec_header *thead;
struct xfs_buf *bp;
xfs_daddr_t first_bad;
int count;
int error = 0;
bool wrapped;
xfs_daddr_t tmp_head;
xfs_daddr_t tmp_tail;
xfs_daddr_t orig_tail = *tail_blk;
bp = xlog_get_bp(log, 1);
if (!bp)
return -ENOMEM;
/*
* Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
* a temporary head block that points after the last possible
* concurrently written record of the tail.
* Make sure the tail points to a record (returns positive count on
* success).
*/
count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
&wrapped);
if (count < 0) {
error = count;
error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp,
&tmp_tail, &thead, &wrapped);
if (error < 0)
goto out;
if (*tail_blk != tmp_tail)
*tail_blk = tmp_tail;
/*
* Run a CRC check from the tail to the head. We can't just check
* MAX_ICLOGS records past the tail because the tail may point to stale
* blocks cleared during the search for the head/tail. These blocks are
* overwritten with zero-length records and thus record count is not a
* reliable indicator of the iclog state before a crash.
*/
first_bad = 0;
error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
XLOG_RECOVER_CRCPASS, &first_bad);
while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
int tail_distance;
/*
* Is corruption within range of the head? If so, retry from
* the next record. Otherwise return an error.
*/
tail_distance = xlog_tail_distance(log, head_blk, first_bad);
if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
break;
/* skip to the next record; returns positive count on success */
error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp,
&tmp_tail, &thead, &wrapped);
if (error < 0)
goto out;
*tail_blk = tmp_tail;
first_bad = 0;
error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
XLOG_RECOVER_CRCPASS, &first_bad);
}
/*
* If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
* into the actual log head. tmp_head points to the start of the record
* so update it to the actual head block.
*/
if (count < XLOG_MAX_ICLOGS + 1)
tmp_head = head_blk;
/*
* We now have a tail and temporary head block that covers at least
* XLOG_MAX_ICLOGS records from the tail. We need to verify that these
* records were completely written. Run a CRC verification pass from
* tail to head and return the result.
*/
error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
XLOG_RECOVER_CRCPASS, &first_bad);
if (!error && *tail_blk != orig_tail)
xfs_warn(log->l_mp,
"Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
orig_tail, *tail_blk);
out:
xlog_put_bp(bp);
return error;
@@ -1143,7 +1188,7 @@ xlog_verify_head(
*/
error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
XLOG_RECOVER_CRCPASS, &first_bad);
if (error == -EFSBADCRC) {
if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
/*
* We've hit a potential torn write. Reset the error and warn
* about it.
@@ -1183,31 +1228,12 @@ xlog_verify_head(
ASSERT(0);
return 0;
}
/*
* Now verify the tail based on the updated head. This is
* required because the torn writes trimmed from the head could
* have been written over the tail of a previous record. Return
* any errors since recovery cannot proceed if the tail is
* corrupt.
*
* XXX: This leaves a gap in truly robust protection from torn
* writes in the log. If the head is behind the tail, the tail
* pushes forward to create some space and then a crash occurs
* causing the writes into the previous record's tail region to
* tear, log recovery isn't able to recover.
*
* How likely is this to occur? If possible, can we do something
* more intelligent here? Is it safe to push the tail forward if
* we can determine that the tail is within the range of the
* torn write (e.g., the kernel can only overwrite the tail if
* it has actually been pushed forward)? Alternatively, could we
* somehow prevent this condition at runtime?
*/
error = xlog_verify_tail(log, *head_blk, *tail_blk);
}
if (error)
return error;
return xlog_verify_tail(log, *head_blk, tail_blk,
be32_to_cpu((*rhead)->h_size));
}
/*
@@ -4152,7 +4178,7 @@ xlog_recover_commit_trans(
#define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
hlist_del(&trans->r_list);
hlist_del_init(&trans->r_list);
error = xlog_recover_reorder_trans(log, trans, pass);
if (error)
@@ -4354,6 +4380,8 @@ xlog_recover_free_trans(
xlog_recover_item_t *item, *n;
int i;
hlist_del_init(&trans->r_list);
list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
/* Free the regions in the item. */
list_del(&item->ri_list);
@@ -4799,12 +4827,16 @@ xlog_recover_process_intents(
int error = 0;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp;
#if defined(DEBUG) || defined(XFS_WARN)
xfs_lsn_t last_lsn;
#endif
ailp = log->l_ailp;
spin_lock(&ailp->xa_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
#if defined(DEBUG) || defined(XFS_WARN)
last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
#endif
while (lip != NULL) {
/*
* We're done when we see something other than an intent.
@@ -5214,7 +5246,7 @@ xlog_do_recovery_pass(
xfs_daddr_t *first_bad) /* out: first bad log rec */
{
xlog_rec_header_t *rhead;
xfs_daddr_t blk_no;
xfs_daddr_t blk_no, rblk_no;
xfs_daddr_t rhead_blk;
char *offset;
xfs_buf_t *hbp, *dbp;
@@ -5222,11 +5254,15 @@ xlog_do_recovery_pass(
int error2 = 0;
int bblks, split_bblks;
int hblks, split_hblks, wrapped_hblks;
int i;
struct hlist_head rhash[XLOG_RHASH_SIZE];
LIST_HEAD (buffer_list);
ASSERT(head_blk != tail_blk);
rhead_blk = 0;
blk_no = rhead_blk = tail_blk;
for (i = 0; i < XLOG_RHASH_SIZE; i++)
INIT_HLIST_HEAD(&rhash[i]);
/*
* Read the header of the tail block and get the iclog buffer size from
@@ -5301,7 +5337,6 @@ xlog_do_recovery_pass(
}
memset(rhash, 0, sizeof(rhash));
blk_no = rhead_blk = tail_blk;
if (tail_blk > head_blk) {
/*
* Perform recovery around the end of the physical log.
@@ -5363,9 +5398,19 @@ xlog_do_recovery_pass(
bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
blk_no += hblks;
/* Read in data for log record */
if (blk_no + bblks <= log->l_logBBsize) {
error = xlog_bread(log, blk_no, bblks, dbp,
/*
* Read the log record data in multiple reads if it
* wraps around the end of the log. Note that if the
* header already wrapped, blk_no could point past the
* end of the log. The record data is contiguous in
* that case.
*/
if (blk_no + bblks <= log->l_logBBsize ||
blk_no >= log->l_logBBsize) {
/* mod blk_no in case the header wrapped and
* pushed it beyond the end of the log */
rblk_no = do_mod(blk_no, log->l_logBBsize);
error = xlog_bread(log, rblk_no, bblks, dbp,
&offset);
if (error)
goto bread_err2;
@@ -5464,6 +5509,19 @@ xlog_do_recovery_pass(
if (error && first_bad)
*first_bad = rhead_blk;
/*
* Transactions are freed at commit time but transactions without commit
* records on disk are never committed. Free any that may be left in the
* hash table.
*/
for (i = 0; i < XLOG_RHASH_SIZE; i++) {
struct hlist_node *tmp;
struct xlog_recover *trans;
hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
xlog_recover_free_trans(trans);
}
return error ? error : error2;
}
@@ -5542,6 +5600,8 @@ xlog_do_recover(
xfs_buf_t *bp;
xfs_sb_t *sbp;
trace_xfs_log_recover(log, head_blk, tail_blk);
/*
* First replay the images in the log.
*/

View File

@@ -924,15 +924,6 @@ xfs_mountfs(
}
}
/*
* During the second phase of log recovery, we need iget and
* iput to behave like they do for an active filesystem.
* xfs_fs_drop_inode needs to be able to prevent the deletion
* of inodes before we're done replaying log items on those
* inodes.
*/
mp->m_super->s_flags |= MS_ACTIVE;
/*
* Finish recovering the file system. This part needed to be delayed
* until after the root and real-time bitmap inodes were consistently
@@ -1008,12 +999,13 @@ xfs_mountfs(
out_quota:
xfs_qm_unmount_quotas(mp);
out_rtunmount:
mp->m_super->s_flags &= ~MS_ACTIVE;
xfs_rtunmount_inodes(mp);
out_rele_rip:
IRELE(rip);
cancel_delayed_work_sync(&mp->m_reclaim_work);
xfs_reclaim_inodes(mp, SYNC_WAIT);
/* Clean out dquots that might be in memory after quotacheck. */
xfs_qm_unmount(mp);
out_log_dealloc:
mp->m_flags |= XFS_MOUNT_UNMOUNTING;
xfs_log_mount_cancel(mp);

View File

@@ -111,6 +111,9 @@ restart:
skipped = 0;
break;
}
/* we're done if id overflows back to zero */
if (!next_index)
break;
}
if (skipped) {
@@ -1247,6 +1250,7 @@ xfs_qm_flush_one(
struct xfs_dquot *dqp,
void *data)
{
struct xfs_mount *mp = dqp->q_mount;
struct list_head *buffer_list = data;
struct xfs_buf *bp = NULL;
int error = 0;
@@ -1257,7 +1261,32 @@ xfs_qm_flush_one(
if (!XFS_DQ_IS_DIRTY(dqp))
goto out_unlock;
xfs_dqflock(dqp);
/*
* The only way the dquot is already flush locked by the time quotacheck
* gets here is if reclaim flushed it before the dqadjust walk dirtied
* it for the final time. Quotacheck collects all dquot bufs in the
* local delwri queue before dquots are dirtied, so reclaim can't have
* possibly queued it for I/O. The only way out is to push the buffer to
* cycle the flush lock.
*/
if (!xfs_dqflock_nowait(dqp)) {
/* buf is pinned in-core by delwri list */
DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen);
bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL);
if (!bp) {
error = -EINVAL;
goto out_unlock;
}
xfs_buf_unlock(bp);
xfs_buf_delwri_pushbuf(bp, buffer_list);
xfs_buf_rele(bp);
error = -EAGAIN;
goto out_unlock;
}
error = xfs_qm_dqflush(dqp, &bp);
if (error)
goto out_unlock;

View File

@@ -169,6 +169,8 @@ xfs_reflink_find_shared(
error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
if (error)
return error;
if (!agbp)
return -ENOMEM;
cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
@@ -333,7 +335,7 @@ xfs_reflink_convert_cow_extent(
struct xfs_defer_ops *dfops)
{
struct xfs_bmbt_irec irec = *imap;
xfs_fsblock_t first_block;
xfs_fsblock_t first_block = NULLFSBLOCK;
int nimaps = 1;
if (imap->br_state == XFS_EXT_NORM)

View File

@@ -1214,7 +1214,7 @@ xfs_test_remount_options(
tmp_mp->m_super = sb;
error = xfs_parseargs(tmp_mp, options);
xfs_free_fsname(tmp_mp);
kfree(tmp_mp);
kmem_free(tmp_mp);
return error;
}

View File

@@ -366,6 +366,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done);
DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
DEFINE_BUF_EVENT(xfs_buf_delwri_split);
DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_bdstrat_shut);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
@@ -519,7 +520,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
@@ -1990,6 +1990,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
/*
 * Tracepoint xfs_log_recover: records the device of the log's mount
 * (log->l_mp->m_super->s_dev) together with the head and tail block numbers
 * handed in by the caller. The event is printed as
 * "dev MAJOR:MINOR headblk 0x... tailblk 0x...".
 */
TRACE_EVENT(xfs_log_recover,
TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk),
TP_ARGS(log, headblk, tailblk),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_daddr_t, headblk)
__field(xfs_daddr_t, tailblk)
),
/* snapshot the arguments into the trace record */
TP_fast_assign(
__entry->dev = log->l_mp->m_super->s_dev;
__entry->headblk = headblk;
__entry->tailblk = tailblk;
),
TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk,
__entry->tailblk)
)
TRACE_EVENT(xfs_log_recover_record,
TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass),
TP_ARGS(log, rhead, pass),

View File

@@ -50,6 +50,7 @@ typedef struct xfs_log_item {
struct xfs_ail *li_ailp; /* ptr to AIL */
uint li_type; /* item type */
uint li_flags; /* misc flags */
struct xfs_buf *li_buf; /* real buffer pointer */
struct xfs_log_item *li_bio_list; /* buffer item list */
void (*li_cb)(struct xfs_buf *,
struct xfs_log_item *);
@@ -66,10 +67,12 @@ typedef struct xfs_log_item {
#define XFS_LI_IN_AIL 0x1
#define XFS_LI_ABORTED 0x2
#define XFS_LI_FAILED 0x4
#define XFS_LI_FLAGS \
{ XFS_LI_IN_AIL, "IN_AIL" }, \
{ XFS_LI_ABORTED, "ABORTED" }
{ XFS_LI_ABORTED, "ABORTED" }, \
{ XFS_LI_FAILED, "FAILED" }
struct xfs_item_ops {
void (*iop_size)(xfs_log_item_t *, int *, int *);
@@ -80,6 +83,7 @@ struct xfs_item_ops {
void (*iop_unlock)(xfs_log_item_t *);
xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
void (*iop_error)(xfs_log_item_t *, xfs_buf_t *);
};
void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
@@ -213,12 +217,14 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
uint);
void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
void xfs_extent_free_init_defer_op(void);
@@ -277,6 +283,6 @@ int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp,
struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops,
enum xfs_bmap_intent_type type, struct xfs_inode *ip,
int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
xfs_filblks_t blockcount, xfs_exntst_t state);
xfs_filblks_t *blockcount, xfs_exntst_t state);
#endif /* __XFS_TRANS_H__ */

View File

@@ -684,8 +684,24 @@ xfs_trans_ail_update_bulk(
}
}
/*
* xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
/*
 * Take a single log item off the AIL and reset its AIL-related state
 * (failed marker, XFS_LI_IN_AIL flag, LSN).
 *
 * Returns true when the item removed was the one xfs_ail_min() reported
 * before the removal, i.e. the minimum item of the AIL has changed.
 */
bool
xfs_ail_delete_one(
	struct xfs_ail		*ailp,
	struct xfs_log_item	*lip)
{
	/* sample the current minimum before the list is modified */
	struct xfs_log_item	*min_lip = xfs_ail_min(ailp);
	bool			removed_min = (min_lip == lip);

	trace_xfs_ail_delete(lip, min_lip->li_lsn, lip->li_lsn);
	xfs_ail_delete(ailp, lip);

	/*
	 * xfs_clear_li_failed() asserts XFS_LI_IN_AIL, so it has to run
	 * before the flag is dropped below.
	 */
	xfs_clear_li_failed(lip);
	lip->li_flags &= ~XFS_LI_IN_AIL;
	lip->li_lsn = 0;

	return removed_min;
}
/**
* Remove a log item from the AIL
*
* @xfs_trans_ail_delete_bulk takes an array of log items that all need to
* removed from the AIL. The caller is already holding the AIL lock, and done
@@ -706,23 +722,15 @@ xfs_trans_ail_update_bulk(
* before returning.
*/
void
xfs_trans_ail_delete_bulk(
xfs_trans_ail_delete(
struct xfs_ail *ailp,
struct xfs_log_item **log_items,
int nr_items,
struct xfs_log_item *lip,
int shutdown_type) __releases(ailp->xa_lock)
{
xfs_log_item_t *mlip;
int mlip_changed = 0;
int i;
mlip = xfs_ail_min(ailp);
for (i = 0; i < nr_items; i++) {
struct xfs_log_item *lip = log_items[i];
if (!(lip->li_flags & XFS_LI_IN_AIL)) {
struct xfs_mount *mp = ailp->xa_mount;
bool mlip_changed;
if (!(lip->li_flags & XFS_LI_IN_AIL)) {
spin_unlock(&ailp->xa_lock);
if (!XFS_FORCED_SHUTDOWN(mp)) {
xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
@@ -733,25 +741,17 @@ xfs_trans_ail_delete_bulk(
return;
}
trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
xfs_ail_delete(ailp, lip);
lip->li_flags &= ~XFS_LI_IN_AIL;
lip->li_lsn = 0;
if (mlip == lip)
mlip_changed = 1;
}
mlip_changed = xfs_ail_delete_one(ailp, lip);
if (mlip_changed) {
if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
xlog_assign_tail_lsn_locked(ailp->xa_mount);
if (!XFS_FORCED_SHUTDOWN(mp))
xlog_assign_tail_lsn_locked(mp);
if (list_empty(&ailp->xa_ail))
wake_up_all(&ailp->xa_empty);
spin_unlock(&ailp->xa_lock);
xfs_log_space_wake(ailp->xa_mount);
} else {
spin_unlock(&ailp->xa_lock);
}
spin_unlock(&ailp->xa_lock);
if (mlip_changed)
xfs_log_space_wake(ailp->xa_mount);
}
int

View File

@@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update(
int whichfork,
xfs_fileoff_t startoff,
xfs_fsblock_t startblock,
xfs_filblks_t blockcount,
xfs_filblks_t *blockcount,
xfs_exntst_t state)
{
int error;
@@ -196,16 +196,23 @@ xfs_bmap_update_finish_item(
void **state)
{
struct xfs_bmap_intent *bmap;
xfs_filblks_t count;
int error;
bmap = container_of(item, struct xfs_bmap_intent, bi_list);
count = bmap->bi_bmap.br_blockcount;
error = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
bmap->bi_type,
bmap->bi_owner, bmap->bi_whichfork,
bmap->bi_bmap.br_startoff,
bmap->bi_bmap.br_startblock,
bmap->bi_bmap.br_blockcount,
&count,
bmap->bi_bmap.br_state);
if (!error && count > 0) {
ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
bmap->bi_bmap.br_blockcount = count;
return -EAGAIN;
}
kmem_free(bmap);
return error;
}

View File

@@ -356,6 +356,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
xfs_buf_t *bp)
{
xfs_buf_log_item_t *bip;
int freed;
/*
* Default to a normal brelse() call if the tp is NULL.
@@ -419,16 +420,22 @@ xfs_trans_brelse(xfs_trans_t *tp,
/*
* Drop our reference to the buf log item.
*/
atomic_dec(&bip->bli_refcount);
freed = atomic_dec_and_test(&bip->bli_refcount);
/*
* If the buf item is not tracking data in the log, then
* we must free it before releasing the buffer back to the
* free pool. Before releasing the buffer to the free pool,
* clear the transaction pointer in b_fsprivate2 to dissolve
* its relation to this transaction.
* If the buf item is not tracking data in the log, then we must free it
* before releasing the buffer back to the free pool.
*
* If the fs has shutdown and we dropped the last reference, it may fall
* on us to release a (possibly dirty) bli if it never made it to the
* AIL (e.g., the aborted unpin already happened and didn't release it
* due to our reference). Since we're already shutdown and need xa_lock,
* just force remove from the AIL and release the bli here.
*/
if (!xfs_buf_item_dirty(bip)) {
if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) {
xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
} else if (!(bip->bli_flags & XFS_BLI_DIRTY)) {
/***
ASSERT(bp->b_pincount == 0);
***/
@@ -486,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
}
/*
* This is called to mark bytes first through last inclusive of the given
* buffer as needing to be logged when the transaction is committed.
* The buffer must already be associated with the given transaction.
*
* First and last are numbers relative to the beginning of this buffer,
* so the first byte in the buffer is numbered 0 regardless of the
* value of b_blkno.
* Mark a buffer dirty in the transaction.
*/
void
xfs_trans_log_buf(xfs_trans_t *tp,
xfs_buf_t *bp,
uint first,
uint last)
xfs_trans_dirty_buf(
struct xfs_trans *tp,
struct xfs_buf *bp)
{
xfs_buf_log_item_t *bip = bp->b_fspriv;
struct xfs_buf_log_item *bip = bp->b_fspriv;
ASSERT(bp->b_transp == tp);
ASSERT(bip != NULL);
ASSERT(first <= last && last < BBTOB(bp->b_length));
ASSERT(bp->b_iodone == NULL ||
bp->b_iodone == xfs_buf_iodone_callbacks);
@@ -524,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
bp->b_iodone = xfs_buf_iodone_callbacks;
bip->bli_item.li_cb = xfs_buf_iodone;
trace_xfs_trans_log_buf(bip);
/*
* If we invalidated the buffer within this transaction, then
* cancel the invalidation now that we're dirtying the buffer
@@ -538,16 +535,36 @@ xfs_trans_log_buf(xfs_trans_t *tp,
bp->b_flags &= ~XBF_STALE;
bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
}
bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
tp->t_flags |= XFS_TRANS_DIRTY;
bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
}
/*
* If we have an ordered buffer we are not logging any dirty range but
* it still needs to be marked dirty and that it has been logged.
/*
* This is called to mark bytes first through last inclusive of the given
* buffer as needing to be logged when the transaction is committed.
* The buffer must already be associated with the given transaction.
*
* First and last are numbers relative to the beginning of this buffer,
* so the first byte in the buffer is numbered 0 regardless of the
* value of b_blkno.
*/
bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
if (!(bip->bli_flags & XFS_BLI_ORDERED))
void
xfs_trans_log_buf(
struct xfs_trans *tp,
struct xfs_buf *bp,
uint first,
uint last)
{
struct xfs_buf_log_item *bip = bp->b_fspriv;
ASSERT(first <= last && last < BBTOB(bp->b_length));
ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED));
xfs_trans_dirty_buf(tp, bp);
trace_xfs_trans_log_buf(bip);
xfs_buf_item_log(bip, first, last);
}
@@ -701,14 +718,13 @@ xfs_trans_inode_alloc_buf(
}
/*
* Mark the buffer as ordered for this transaction. This means
* that the contents of the buffer are not recorded in the transaction
* but it is tracked in the AIL as though it was. This allows us
* to record logical changes in transactions rather than the physical
* changes we make to the buffer without changing writeback ordering
* constraints of metadata buffers.
* Mark the buffer as ordered for this transaction. This means that the contents
* of the buffer are not recorded in the transaction but it is tracked in the
* AIL as though it was. This allows us to record logical changes in
* transactions rather than the physical changes we make to the buffer without
* changing writeback ordering constraints of metadata buffers.
*/
void
bool
xfs_trans_ordered_buf(
struct xfs_trans *tp,
struct xfs_buf *bp)
@@ -719,8 +735,18 @@ xfs_trans_ordered_buf(
ASSERT(bip != NULL);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
if (xfs_buf_item_dirty_format(bip))
return false;
bip->bli_flags |= XFS_BLI_ORDERED;
trace_xfs_buf_item_ordered(bip);
/*
* We don't log a dirty range of an ordered buffer but it still needs
* to be marked dirty and that it has been logged.
*/
xfs_trans_dirty_buf(tp, bp);
return true;
}
/*

View File

@@ -106,18 +106,9 @@ xfs_trans_ail_update(
xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
}
void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
struct xfs_log_item **log_items, int nr_items,
int shutdown_type)
__releases(ailp->xa_lock);
static inline void
xfs_trans_ail_delete(
struct xfs_ail *ailp,
xfs_log_item_t *lip,
int shutdown_type) __releases(ailp->xa_lock)
{
xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
}
bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
int shutdown_type) __releases(ailp->xa_lock);
static inline void
xfs_trans_ail_remove(
@@ -173,4 +164,35 @@ xfs_trans_ail_copy_lsn(
*dst = *src;
}
#endif
/*
 * Undo the "failed" marking of a log item. If XFS_LI_FAILED is set, the flag
 * is cleared, li_buf is reset to NULL and the buffer it pointed to is passed
 * to xfs_buf_rele(). Items without the flag are left untouched.
 *
 * The item must still be flagged XFS_LI_IN_AIL and the xa_lock of its AIL
 * must be held; both are asserted.
 */
static inline void
xfs_clear_li_failed(
	struct xfs_log_item	*lip)
{
	struct xfs_buf	*failed_bp = lip->li_buf;

	ASSERT(lip->li_flags & XFS_LI_IN_AIL);
	lockdep_assert_held(&lip->li_ailp->xa_lock);

	/* nothing to undo unless the item was marked failed */
	if (!(lip->li_flags & XFS_LI_FAILED))
		return;

	lip->li_flags &= ~XFS_LI_FAILED;
	lip->li_buf = NULL;
	xfs_buf_rele(failed_bp);
}
/*
 * Mark a log item as failed and remember the buffer involved. On the first
 * marking the buffer gets an extra xfs_buf_hold() and is stored in li_buf;
 * an item that already carries XFS_LI_FAILED is left unchanged.
 *
 * The xa_lock of the item's AIL must be held (asserted via lockdep).
 */
static inline void
xfs_set_li_failed(
	struct xfs_log_item	*lip,
	struct xfs_buf		*bp)
{
	lockdep_assert_held(&lip->li_ailp->xa_lock);

	/* already marked: keep the buffer recorded earlier */
	if (lip->li_flags & XFS_LI_FAILED)
		return;

	xfs_buf_hold(bp);
	lip->li_flags |= XFS_LI_FAILED;
	lip->li_buf = bp;
}
#endif /* __XFS_TRANS_PRIV_H__ */

View File

@@ -2782,6 +2782,7 @@ static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { };
#endif
extern void unlock_new_inode(struct inode *);
extern unsigned int get_next_ino(void);
extern void evict_inodes(struct super_block *sb);
extern void __iget(struct inode * inode);
extern void iget_failed(struct inode *);

View File

@@ -3901,6 +3901,8 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
updev; \
updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter)))
bool netdev_has_any_upper_dev(struct net_device *dev);
void *netdev_lower_get_next_private(struct net_device *dev,
struct list_head **iter);
void *netdev_lower_get_next_private_rcu(struct net_device *dev,

View File

@@ -1,14 +1,9 @@
#ifndef __NET_FRAG_H__
#define __NET_FRAG_H__
#include <linux/percpu_counter.h>
struct netns_frags {
/* The percpu_counter "mem" need to be cacheline aligned.
* mem.count must not share cacheline with other writers
*/
struct percpu_counter mem ____cacheline_aligned_in_smp;
/* Keep atomic mem on separate cachelines in structs that include it */
atomic_t mem ____cacheline_aligned_in_smp;
/* sysctls */
int timeout;
int high_thresh;
@@ -108,15 +103,10 @@ struct inet_frags {
int inet_frags_init(struct inet_frags *);
void inet_frags_fini(struct inet_frags *);
static inline int inet_frags_init_net(struct netns_frags *nf)
static inline void inet_frags_init_net(struct netns_frags *nf)
{
return percpu_counter_init(&nf->mem, 0, GFP_KERNEL);
atomic_set(&nf->mem, 0);
}
static inline void inet_frags_uninit_net(struct netns_frags *nf)
{
percpu_counter_destroy(&nf->mem);
}
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
@@ -140,37 +130,24 @@ static inline bool inet_frag_evicting(struct inet_frag_queue *q)
/* Memory Tracking Functions. */
/* The default percpu_counter batch size is not big enough to scale to
* fragmentation mem acct sizes.
* The mem size of a 64K fragment is approx:
* (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes
*/
static unsigned int frag_percpu_counter_batch = 130000;
static inline int frag_mem_limit(struct netns_frags *nf)
{
return percpu_counter_read(&nf->mem);
return atomic_read(&nf->mem);
}
static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
{
__percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch);
atomic_sub(i, &nf->mem);
}
static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
{
__percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch);
atomic_add(i, &nf->mem);
}
static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
static inline int sum_frag_mem_limit(struct netns_frags *nf)
{
unsigned int res;
local_bh_disable();
res = percpu_counter_sum_positive(&nf->mem);
local_bh_enable();
return res;
return atomic_read(&nf->mem);
}
/* RFC 3168 support :

View File

@@ -68,6 +68,7 @@ struct fib6_node {
__u16 fn_flags;
int fn_sernum;
struct rt6_info *rr_ptr;
struct rcu_head rcu;
};
#ifndef CONFIG_IPV6_SUBTREES
@@ -102,7 +103,7 @@ struct rt6_info {
* the same cache line.
*/
struct fib6_table *rt6i_table;
struct fib6_node *rt6i_node;
struct fib6_node __rcu *rt6i_node;
struct in6_addr rt6i_gateway;
@@ -165,13 +166,40 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout)
rt0->rt6i_flags |= RTF_EXPIRES;
}
/* Read fn_sernum of the fib6 node attached to @rt under rcu_read_lock().
 * When rt->rt6i_node is non-NULL its serial number is written to *cookie
 * and true is returned. Otherwise *cookie is not touched and the result
 * is false.
 */
static inline bool rt6_get_cookie_safe(const struct rt6_info *rt,
				       u32 *cookie)
{
	struct fib6_node *node;
	bool found;

	rcu_read_lock();
	node = rcu_dereference(rt->rt6i_node);
	found = node != NULL;
	if (found)
		*cookie = node->fn_sernum;
	rcu_read_unlock();

	return found;
}
static inline u32 rt6_get_cookie(const struct rt6_info *rt)
{
u32 cookie = 0;
if (rt->rt6i_flags & RTF_PCPU ||
(unlikely(rt->dst.flags & DST_NOCACHE) && rt->dst.from))
rt = (struct rt6_info *)(rt->dst.from);
return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
rt6_get_cookie_safe(rt, &cookie);
return cookie;
}
static inline void ip6_rt_put(struct rt6_info *rt)

View File

@@ -48,6 +48,9 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
#ifdef CONFIG_NET_SWITCHDEV
skb->offload_fwd_mark = 0;
#endif
BR_INPUT_SKB_CB(skb)->brdev = dev;
skb_reset_mac_header(skb);

View File

@@ -351,7 +351,7 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
if (flags & MSG_PEEK) {
err = -ENOENT;
spin_lock_bh(&sk->sk_receive_queue.lock);
if (skb == skb_peek(&sk->sk_receive_queue)) {
if (skb->next) {
__skb_unlink(skb, &sk->sk_receive_queue);
atomic_dec(&skb->users);
err = 0;

View File

@@ -5337,12 +5337,13 @@ EXPORT_SYMBOL(netdev_has_upper_dev);
* Find out if a device is linked to an upper device and return true in case
* it is. The caller must hold the RTNL lock.
*/
static bool netdev_has_any_upper_dev(struct net_device *dev)
bool netdev_has_any_upper_dev(struct net_device *dev)
{
ASSERT_RTNL();
return !list_empty(&dev->all_adj_list.upper);
}
EXPORT_SYMBOL(netdev_has_any_upper_dev);
/**
* netdev_master_upper_dev_get - Get master upper device

View File

@@ -580,19 +580,14 @@ static int __net_init lowpan_frags_init_net(struct net *net)
{
struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net);
int res;
ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
res = inet_frags_init_net(&ieee802154_lowpan->frags);
if (res)
return res;
res = lowpan_frags_ns_sysctl_register(net);
if (res)
inet_frags_uninit_net(&ieee802154_lowpan->frags);
return res;
inet_frags_init_net(&ieee802154_lowpan->frags);
return lowpan_frags_ns_sysctl_register(net);
}
static void __net_exit lowpan_frags_exit_net(struct net *net)

View File

@@ -234,10 +234,8 @@ evict_again:
cond_resched();
if (read_seqretry(&f->rnd_seqlock, seq) ||
percpu_counter_sum(&nf->mem))
sum_frag_mem_limit(nf))
goto evict_again;
percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);

View File

@@ -835,8 +835,6 @@ static void __init ip4_frags_ctl_register(void)
static int __net_init ipv4_frags_init_net(struct net *net)
{
int res;
/* Fragment cache limits.
*
* The fragment memory accounting code, (tries to) account for
@@ -862,13 +860,9 @@ static int __net_init ipv4_frags_init_net(struct net *net)
net->ipv4.frags.max_dist = 64;
res = inet_frags_init_net(&net->ipv4.frags);
if (res)
return res;
res = ip4_frags_ns_ctl_register(net);
if (res)
inet_frags_uninit_net(&net->ipv4.frags);
return res;
inet_frags_init_net(&net->ipv4.frags);
return ip4_frags_ns_ctl_register(net);
}
static void __net_exit ipv4_frags_exit_net(struct net *net)

View File

@@ -609,8 +609,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
ip_rt_put(rt);
goto tx_dropped;
}
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
df, !net_eq(tunnel->net, dev_net(dev)));
return;
tx_error:
dev->stats.tx_errors++;

View File

@@ -2297,6 +2297,10 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
inet_csk_delack_init(sk);
/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
* issue in __tcp_select_window()
*/
icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
tcp_init_send_head(sk);
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);

View File

@@ -5474,7 +5474,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
* our DAD process, so we don't need
* to do it again
*/
if (!(ifp->rt->rt6i_node))
if (!rcu_access_pointer(ifp->rt->rt6i_node))
ip6_ins_rt(ifp->rt);
if (ifp->idev->cnf.forwarding)
addrconf_join_anycast(ifp);

View File

@@ -148,11 +148,23 @@ static struct fib6_node *node_alloc(void)
return fn;
}
static void node_free(struct fib6_node *fn)
/*
 * Return @fn to the fib6_node_kmem cache right away, without going through
 * call_rcu(). The callers visible in this patch use it on error paths, for
 * nodes that were just allocated.
 */
static void node_free_immediate(struct fib6_node *fn)
{
kmem_cache_free(fib6_node_kmem, fn);
}
/*
 * RCU callback: @head is the rcu member embedded in a struct fib6_node.
 * Recover the enclosing node and return it to the fib6_node_kmem cache.
 */
static void node_free_rcu(struct rcu_head *head)
{
struct fib6_node *fn = container_of(head, struct fib6_node, rcu);
kmem_cache_free(fib6_node_kmem, fn);
}
/*
 * Deferred free of @fn: queue node_free_rcu() on the node's rcu head so the
 * memory is released from the RCU callback rather than immediately.
 */
static void node_free(struct fib6_node *fn)
{
call_rcu(&fn->rcu, node_free_rcu);
}
static void rt6_rcu_free(struct rt6_info *rt)
{
call_rcu(&rt->dst.rcu_head, dst_rcu_free);
@@ -189,6 +201,12 @@ static void rt6_release(struct rt6_info *rt)
}
}
/*
 * Tear down one fib6 table: invalidate its inetpeer tree (tb6_peers), then
 * kfree() the table structure itself.
 */
static void fib6_free_table(struct fib6_table *table)
{
inetpeer_invalidate_tree(&table->tb6_peers);
kfree(table);
}
static void fib6_link_table(struct net *net, struct fib6_table *tb)
{
unsigned int h;
@@ -589,9 +607,9 @@ insert_above:
if (!in || !ln) {
if (in)
node_free(in);
node_free_immediate(in);
if (ln)
node_free(ln);
node_free_immediate(ln);
return ERR_PTR(-ENOMEM);
}
@@ -862,7 +880,7 @@ add:
rt->dst.rt6_next = iter;
*ins = rt;
rt->rt6i_node = fn;
rcu_assign_pointer(rt->rt6i_node, fn);
atomic_inc(&rt->rt6i_ref);
inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
@@ -887,7 +905,7 @@ add:
return err;
*ins = rt;
rt->rt6i_node = fn;
rcu_assign_pointer(rt->rt6i_node, fn);
rt->dst.rt6_next = iter->dst.rt6_next;
atomic_inc(&rt->rt6i_ref);
inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
@@ -1020,7 +1038,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
root, and then (in failure) stale node
in main tree.
*/
node_free(sfn);
node_free_immediate(sfn);
err = PTR_ERR(sn);
goto failure;
}
@@ -1447,8 +1465,9 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
int fib6_del(struct rt6_info *rt, struct nl_info *info)
{
struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
lockdep_is_held(&rt->rt6i_table->tb6_lock));
struct net *net = info->nl_net;
struct fib6_node *fn = rt->rt6i_node;
struct rt6_info **rtp;
#if RT6_DEBUG >= 2
@@ -1637,7 +1656,9 @@ static int fib6_clean_node(struct fib6_walker *w)
if (res) {
#if RT6_DEBUG >= 2
pr_debug("%s: del failed: rt=%p@%p err=%d\n",
__func__, rt, rt->rt6i_node, res);
__func__, rt,
rcu_access_pointer(rt->rt6i_node),
res);
#endif
continue;
}
@@ -1878,15 +1899,22 @@ out_timer:
static void fib6_net_exit(struct net *net)
{
unsigned int i;
rt6_ifdown(net, NULL);
del_timer_sync(&net->ipv6.ip6_fib_timer);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers);
kfree(net->ipv6.fib6_local_tbl);
#endif
inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers);
kfree(net->ipv6.fib6_main_tbl);
for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
struct hlist_head *head = &net->ipv6.fib_table_hash[i];
struct hlist_node *tmp;
struct fib6_table *tb;
hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
hlist_del(&tb->tb6_hlist);
fib6_free_table(tb);
}
}
kfree(net->ipv6.fib_table_hash);
kfree(net->ipv6.rt6_stats);
}

View File

@@ -432,7 +432,9 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
}
break;
case ICMPV6_PKT_TOOBIG:
mtu = be32_to_cpu(info) - offset;
mtu = be32_to_cpu(info) - offset - t->tun_hlen;
if (t->dev->type == ARPHRD_ETHER)
mtu -= ETH_HLEN;
if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU;
t->dev->mtu = mtu;

View File

@@ -622,18 +622,12 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
static int nf_ct_net_init(struct net *net)
{
int res;
net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
res = inet_frags_init_net(&net->nf_frag.frags);
if (res)
return res;
res = nf_ct_frag6_sysctl_register(net);
if (res)
inet_frags_uninit_net(&net->nf_frag.frags);
return res;
inet_frags_init_net(&net->nf_frag.frags);
return nf_ct_frag6_sysctl_register(net);
}
static void nf_ct_net_exit(struct net *net)

View File

@@ -86,7 +86,6 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
while (offset <= packet_len) {
struct ipv6_opt_hdr *exthdr;
unsigned int len;
switch (**nexthdr) {
@@ -112,10 +111,9 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
offset);
len = ipv6_optlen(exthdr);
if (len + offset >= IPV6_MAXPLEN)
offset += ipv6_optlen(exthdr);
if (offset > IPV6_MAXPLEN)
return -EINVAL;
offset += len;
*nexthdr = &exthdr->nexthdr;
}

View File

@@ -709,19 +709,13 @@ static void ip6_frags_sysctl_unregister(void)
static int __net_init ipv6_frags_init_net(struct net *net)
{
int res;
net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
res = inet_frags_init_net(&net->ipv6.frags);
if (res)
return res;
res = ip6_frags_ns_sysctl_register(net);
if (res)
inet_frags_uninit_net(&net->ipv6.frags);
return res;
inet_frags_init_net(&net->ipv6.frags);
return ip6_frags_ns_sysctl_register(net);
}
static void __net_exit ipv6_frags_exit_net(struct net *net)

View File

@@ -1267,7 +1267,9 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt)
static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
u32 rt_cookie = 0;
if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
return NULL;
if (rt6_check_expired(rt))
@@ -1335,8 +1337,14 @@ static void ip6_link_failure(struct sk_buff *skb)
if (rt->rt6i_flags & RTF_CACHE) {
dst_hold(&rt->dst);
ip6_del_rt(rt);
} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
rt->rt6i_node->fn_sernum = -1;
} else {
struct fib6_node *fn;
rcu_read_lock();
fn = rcu_dereference(rt->rt6i_node);
if (fn && (rt->rt6i_flags & RTF_DEFAULT))
fn->fn_sernum = -1;
rcu_read_unlock();
}
}
}
@@ -1353,7 +1361,8 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
return !(rt->rt6i_flags & RTF_CACHE) &&
(rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
(rt->rt6i_flags & RTF_PCPU ||
rcu_access_pointer(rt->rt6i_node));
}
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,

View File

@@ -1381,6 +1381,10 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
if (!csk)
return -EINVAL;
/* We must prevent loops or risk deadlock ! */
if (csk->sk_family == PF_KCM)
return -EOPNOTSUPP;
psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
if (!psock)
return -ENOMEM;

View File

@@ -2151,6 +2151,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
struct timespec ts;
__u32 ts_status;
bool is_drop_n_account = false;
bool do_vnet = false;
/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
* We may add members to them until current aligned size without forcing
@@ -2201,8 +2202,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
netoff = TPACKET_ALIGN(po->tp_hdrlen +
(maclen < 16 ? 16 : maclen)) +
po->tp_reserve;
if (po->has_vnet_hdr)
if (po->has_vnet_hdr) {
netoff += sizeof(struct virtio_net_hdr);
do_vnet = true;
}
macoff = netoff - maclen;
}
if (po->tp_version <= TPACKET_V2) {
@@ -2219,8 +2222,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
skb_set_owner_r(copy_skb, sk);
}
snaplen = po->rx_ring.frame_size - macoff;
if ((int)snaplen < 0)
if ((int)snaplen < 0) {
snaplen = 0;
do_vnet = false;
}
}
} else if (unlikely(macoff + snaplen >
GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
@@ -2233,6 +2238,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if (unlikely((int)snaplen < 0)) {
snaplen = 0;
macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
do_vnet = false;
}
}
spin_lock(&sk->sk_receive_queue.lock);
@@ -2258,7 +2264,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
}
spin_unlock(&sk->sk_receive_queue.lock);
if (po->has_vnet_hdr) {
if (do_vnet) {
if (__packet_rcv_vnet(skb, h.raw + macoff -
sizeof(struct virtio_net_hdr))) {
spin_lock(&sk->sk_receive_queue.lock);

View File

@@ -70,7 +70,8 @@ static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb,
info = nla_data(attr);
list_for_each_entry_rcu(laddr, address_list, list) {
memcpy(info, &laddr->a, addrlen);
memcpy(info, &laddr->a, sizeof(laddr->a));
memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a));
info += addrlen;
}
@@ -93,7 +94,9 @@ static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb,
info = nla_data(attr);
list_for_each_entry(from, &asoc->peer.transport_addr_list,
transports) {
memcpy(info, &from->ipaddr, addrlen);
memcpy(info, &from->ipaddr, sizeof(from->ipaddr));
memset(info + sizeof(from->ipaddr), 0,
addrlen - sizeof(from->ipaddr));
info += addrlen;
}

View File

@@ -4373,8 +4373,7 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
info->sctpi_ictrlchunks = asoc->stats.ictrlchunks;
prim = asoc->peer.primary_path;
memcpy(&info->sctpi_p_address, &prim->ipaddr,
sizeof(struct sockaddr_storage));
memcpy(&info->sctpi_p_address, &prim->ipaddr, sizeof(prim->ipaddr));
info->sctpi_p_state = prim->state;
info->sctpi_p_cwnd = prim->cwnd;
info->sctpi_p_srtt = prim->srtt;

View File

@@ -265,6 +265,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
sctp_ulpq_clear_pd(ulpq);
if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) {
if (!sock_owned_by_user(sk))
sp->data_ready_signalled = 1;
sk->sk_data_ready(sk);
}