Skip to content

Commit bd5e6ec

Browse files
committed
fs: add infrastructure for multigrain timestamps
JIRA: https://issues.redhat.com/browse/RHEL-121527 The VFS has always used coarse-grained timestamps when updating the ctime and mtime after a change. This has the benefit of allowing filesystems to optimize away a lot of metadata updates, down to around 1 per jiffy, even when a file is under heavy writes. Unfortunately, this has always been an issue when we're exporting via NFSv3, which relies on timestamps to validate caches. A lot of changes can happen in a jiffy, so timestamps aren't sufficient to help the client decide when to invalidate the cache. Even with NFSv4, a lot of exported filesystems don't properly support a change attribute and are subject to the same problems with timestamp granularity. Other applications have similar issues with timestamps (e.g. backup applications). If fine-grained timestamps were always used, that would improve the situation, but that becomes rather expensive, as the underlying filesystem would have to log a lot more metadata updates. What is needed is a way to only use fine-grained timestamps when they are being actively queried. Use the (unused) top bit in inode->i_ctime_nsec as a flag that indicates whether the current timestamps have been queried via stat() or the like. When it's set, allow the update to use a fine-grained timestamp iff it's necessary to make the ctime show a different value. If it has been queried, then first see whether the current coarse time is later than the existing ctime. If it is, accept that value. If it isn't, then get a fine-grained timestamp and attempt to stamp the inode ctime with that value. If that races with another concurrent stamp, then abandon the update and take the new value without retrying. Filesystems can opt into this by setting the FS_MGTIME fstype flag. Others should be unaffected (other than being subject to the same floor value as multigrain filesystems). 
Tested-by: Randy Dunlap <rdunlap@infradead.org> # documentation bits Reviewed-by: Jan Kara <jack@suse.cz> Signed-off-by: Jeff Layton <jlayton@kernel.org> Link: https://lore.kernel.org/r/20241002-mgtime-v10-3-d1c4717f5284@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org> (cherry picked from commit 4e40eff) Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
1 parent 189b0ae commit bd5e6ec

File tree

3 files changed

+181
-35
lines changed

3 files changed

+181
-35
lines changed

fs/inode.c

Lines changed: 114 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2210,19 +2210,58 @@ int file_remove_privs(struct file *file)
22102210
}
22112211
EXPORT_SYMBOL(file_remove_privs);
22122212

2213+
/**
2214+
* current_time - Return FS time (possibly fine-grained)
2215+
* @inode: inode.
2216+
*
2217+
* Return the current time truncated to the time granularity supported by
2218+
* the fs, as suitable for a ctime/mtime change. If the ctime is flagged
2219+
* as having been QUERIED, get a fine-grained timestamp, but don't update
2220+
* the floor.
2221+
*
2222+
* For a multigrain inode, this is effectively an estimate of the timestamp
2223+
* that a file would receive. An actual update must go through
2224+
* inode_set_ctime_current().
2225+
*/
2226+
struct timespec64 current_time(struct inode *inode)
2227+
{
2228+
struct timespec64 now;
2229+
u32 cns;
2230+
2231+
ktime_get_coarse_real_ts64_mg(&now);
2232+
2233+
if (!is_mgtime(inode))
2234+
goto out;
2235+
2236+
/* If nothing has queried it, then coarse time is fine */
2237+
cns = smp_load_acquire(&inode->i_ctime_nsec);
2238+
if (cns & I_CTIME_QUERIED) {
2239+
/*
2240+
* If there is no apparent change, then get a fine-grained
2241+
* timestamp.
2242+
*/
2243+
if (now.tv_nsec == (cns & ~I_CTIME_QUERIED))
2244+
ktime_get_real_ts64(&now);
2245+
}
2246+
out:
2247+
return timestamp_truncate(now, inode);
2248+
}
2249+
EXPORT_SYMBOL(current_time);
2250+
22132251
static int inode_needs_update_time(struct inode *inode)
22142252
{
2253+
struct timespec64 now, ts;
22152254
int sync_it = 0;
2216-
struct timespec64 now = current_time(inode);
2217-
struct timespec64 ts;
22182255

22192256
/* First try to exhaust all avenues to not sync */
22202257
if (IS_NOCMTIME(inode))
22212258
return 0;
22222259

2260+
now = current_time(inode);
2261+
22232262
ts = inode_get_mtime(inode);
22242263
if (!timespec64_equal(&ts, &now))
2225-
sync_it = S_MTIME;
2264+
sync_it |= S_MTIME;
22262265

22272266
ts = inode_get_ctime(inode);
22282267
if (!timespec64_equal(&ts, &now))
@@ -2599,6 +2638,15 @@ void inode_nohighmem(struct inode *inode)
25992638
}
26002639
EXPORT_SYMBOL(inode_nohighmem);
26012640

2641+
struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts)
2642+
{
2643+
set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec);
2644+
inode->i_ctime_sec = ts.tv_sec;
2645+
inode->i_ctime_nsec = ts.tv_nsec;
2646+
return ts;
2647+
}
2648+
EXPORT_SYMBOL(inode_set_ctime_to_ts);
2649+
26022650
/**
26032651
* timestamp_truncate - Truncate timespec to a granularity
26042652
* @t: Timespec
@@ -2631,36 +2679,77 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
26312679
EXPORT_SYMBOL(timestamp_truncate);
26322680

26332681
/**
2634-
* current_time - Return FS time
2635-
* @inode: inode.
2682+
* inode_set_ctime_current - set the ctime to current_time
2683+
* @inode: inode
26362684
*
2637-
* Return the current time truncated to the time granularity supported by
2638-
* the fs.
2685+
* Set the inode's ctime to the current value for the inode. Returns the
2686+
* current value that was assigned. If this is not a multigrain inode, then we
2687+
* set it to the later of the coarse time and floor value.
26392688
*
2640-
* Note that inode and inode->sb cannot be NULL.
2641-
* Otherwise, the function warns and returns time without truncation.
2689+
* If it is multigrain, then we first see if the coarse-grained timestamp is
2690+
* distinct from what is already there. If so, then use that. Otherwise, get a
2691+
* fine-grained timestamp.
2692+
*
2693+
* After that, try to swap the new value into i_ctime_nsec. Accept the
2694+
* resulting ctime, regardless of the outcome of the swap. If it has
2695+
* already been replaced, then that timestamp is later than the earlier
2696+
* unacceptable one, and is thus acceptable.
26422697
*/
2643-
struct timespec64 current_time(struct inode *inode)
2698+
struct timespec64 inode_set_ctime_current(struct inode *inode)
26442699
{
26452700
struct timespec64 now;
2701+
u32 cns, cur;
26462702

2647-
ktime_get_coarse_real_ts64(&now);
2648-
return timestamp_truncate(now, inode);
2649-
}
2650-
EXPORT_SYMBOL(current_time);
2703+
/* Start from the coarse clock, run through timestamp_truncate() for this inode. */
ktime_get_coarse_real_ts64_mg(&now);
2704+
now = timestamp_truncate(now, inode);
26512705

2652-
/**
2653-
* inode_set_ctime_current - set the ctime to current_time
2654-
* @inode: inode
2655-
*
2656-
* Set the inode->i_ctime to the current value for the inode. Returns
2657-
* the current value that was assigned to i_ctime.
2658-
*/
2659-
struct timespec64 inode_set_ctime_current(struct inode *inode)
2660-
{
2661-
struct timespec64 now = current_time(inode);
2706+
/* Just return that if this is not a multigrain fs */
2707+
if (!is_mgtime(inode)) {
2708+
inode_set_ctime_to_ts(inode, now);
2709+
goto out;
2710+
}
26622711

2663-
inode_set_ctime_to_ts(inode, now);
2712+
/*
2713+
* A fine-grained time is only needed if someone has queried
2714+
* for timestamps, and the current coarse grained time isn't
2715+
* later than what's already there.
2716+
*/
2717+
cns = smp_load_acquire(&inode->i_ctime_nsec);
2718+
if (cns & I_CTIME_QUERIED) {
2719+
struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec,
2720+
.tv_nsec = cns & ~I_CTIME_QUERIED };
2721+
2722+
if (timespec64_compare(&now, &ctime) <= 0) {
2723+
ktime_get_real_ts64_mg(&now);
2724+
now = timestamp_truncate(now, inode);
2725+
}
2726+
}
2727+
2728+
/* No need to cmpxchg if it's exactly the same */
2729+
/*
 * cns is compared raw, I_CTIME_QUERIED bit included, so this shortcut
 * presumably only fires when the flag is clear (assumes a valid
 * tv_nsec never has bit 31 set -- TODO confirm).
 */
if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec)
2730+
goto out;
2731+
/*
 * cur is the value the swap expects to find. NOTE(review): the else
 * branch below reads cur after a failed swap, so try_cmpxchg()
 * presumably stores the observed value back into it -- confirm.
 */
cur = cns;
2732+
retry:
2733+
/* Try to swap the nsec value into place. */
2734+
if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) {
2735+
/* If swap occurred, then we're (mostly) done */
2736+
/*
 * NOTE(review): only i_ctime_nsec is swapped atomically; i_ctime_sec
 * follows with a plain store.
 */
inode->i_ctime_sec = now.tv_sec;
2737+
} else {
2738+
/*
2739+
* Was the change due to someone marking the old ctime QUERIED?
2740+
* If so then retry the swap. This can only happen once since
2741+
* the only way to clear I_CTIME_QUERIED is to stamp the inode
2742+
* with a new ctime.
2743+
*/
2744+
if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) {
2745+
/*
 * Remember the flagged value: cns now has I_CTIME_QUERIED set, so
 * the test above cannot pass a second time.
 */
cns = cur;
2746+
goto retry;
2747+
}
2748+
/* Otherwise, keep the existing ctime */
2749+
/* Report what is in the inode, with the QUERIED flag masked off. */
now.tv_sec = inode->i_ctime_sec;
2750+
now.tv_nsec = cur & ~I_CTIME_QUERIED;
2751+
}
2752+
out:
26642753
return now;
26652754
}
26662755
EXPORT_SYMBOL(inode_set_ctime_current);

fs/stat.c

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,39 @@
2626
#include "internal.h"
2727
#include "mount.h"
2828

29+
/**
30+
* fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED
31+
* @stat: where to store the resulting values
32+
* @request_mask: STATX_* values requested
33+
* @inode: inode from which to grab the c/mtime
34+
*
35+
* Given @inode, grab the ctime and mtime out if it and store the result
36+
* in @stat. When fetching the value, flag it as QUERIED (if not already)
37+
* so the next write will record a distinct timestamp.
38+
*
39+
* NB: The QUERIED flag is tracked in the ctime, but we set it there even
40+
* if only the mtime was requested, as that ensures that the next mtime
41+
* change will be distinct.
42+
*/
43+
void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode)
44+
{
45+
atomic_t *pcn = (atomic_t *)&inode->i_ctime_nsec;
46+
47+
/* If neither time was requested, then don't report them */
48+
if (!(request_mask & (STATX_CTIME|STATX_MTIME))) {
49+
stat->result_mask &= ~(STATX_CTIME|STATX_MTIME);
50+
return;
51+
}
52+
53+
stat->mtime = inode_get_mtime(inode);
54+
stat->ctime.tv_sec = inode->i_ctime_sec;
55+
stat->ctime.tv_nsec = (u32)atomic_read(pcn);
56+
if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED))
57+
stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn));
58+
stat->ctime.tv_nsec &= ~I_CTIME_QUERIED;
59+
}
60+
EXPORT_SYMBOL(fill_mg_cmtime);
61+
2962
/**
3063
* generic_fillattr - Fill in the basic attributes from the inode struct
3164
* @idmap: idmap of the mount the inode was found from
@@ -58,8 +91,14 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
5891
stat->rdev = inode->i_rdev;
5992
stat->size = i_size_read(inode);
6093
stat->atime = inode_get_atime(inode);
61-
stat->mtime = inode_get_mtime(inode);
62-
stat->ctime = inode_get_ctime(inode);
94+
95+
if (is_mgtime(inode)) {
96+
fill_mg_cmtime(stat, request_mask, inode);
97+
} else {
98+
stat->ctime = inode_get_ctime(inode);
99+
stat->mtime = inode_get_mtime(inode);
100+
}
101+
63102
stat->blksize = i_blocksize(inode);
64103
stat->blocks = inode->i_blocks;
65104

include/linux/fs.h

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1655,14 +1655,25 @@ static inline struct timespec64 inode_set_mtime(struct inode *inode,
16551655
return inode_set_mtime_to_ts(inode, ts);
16561656
}
16571657

1658+
/*
1659+
* Multigrain timestamps
1660+
*
1661+
* Conditionally use fine-grained ctime and mtime timestamps when there
1662+
* are users actively observing them via getattr. The primary use-case
1663+
* for this is NFS clients that use the ctime to distinguish between
1664+
* different states of the file, and that are often fooled by multiple
1665+
* operations that occur in the same coarse-grained timer tick.
1666+
*/
1667+
#define I_CTIME_QUERIED ((u32)BIT(31))
1668+
16581669
static inline time64_t inode_get_ctime_sec(const struct inode *inode)
16591670
{
16601671
return inode->i_ctime_sec;
16611672
}
16621673

16631674
/*
 * Return the nanoseconds part of the inode's ctime. The I_CTIME_QUERIED
 * flag bit that shares i_ctime_nsec is masked off, so the caller only sees
 * the time value.
 */
static inline long inode_get_ctime_nsec(const struct inode *inode)
16641675
{
1665-
return inode->i_ctime_nsec;
1676+
return inode->i_ctime_nsec & ~I_CTIME_QUERIED;
16661677
}
16671678

16681679
static inline struct timespec64 inode_get_ctime(const struct inode *inode)
@@ -1673,13 +1684,7 @@ static inline struct timespec64 inode_get_ctime(const struct inode *inode)
16731684
return ts;
16741685
}
16751686

1676-
static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode,
1677-
struct timespec64 ts)
1678-
{
1679-
inode->i_ctime_sec = ts.tv_sec;
1680-
inode->i_ctime_nsec = ts.tv_nsec;
1681-
return ts;
1682-
}
1687+
struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts);
16831688

16841689
/**
16851690
* inode_set_ctime - set the ctime in the inode
@@ -2546,6 +2551,7 @@ struct file_system_type {
25462551
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
25472552
#define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */
25482553
#define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */
2554+
#define FS_MGTIME 64 /* FS uses multigrain timestamps */
25492555
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
25502556
int (*init_fs_context)(struct fs_context *);
25512557
const struct fs_parameter_spec *parameters;
@@ -2569,6 +2575,17 @@ struct file_system_type {
25692575

25702576
#define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
25712577

2578+
/**
2579+
* is_mgtime: is this inode using multigrain timestamps
2580+
* @inode: inode to test for multigrain timestamps
2581+
*
2582+
* Return true if the inode uses multigrain timestamps, false otherwise.
2583+
*/
2584+
static inline bool is_mgtime(const struct inode *inode)
2585+
{
2586+
return inode->i_sb->s_type->fs_flags & FS_MGTIME;
2587+
}
2588+
25722589
extern struct dentry *mount_bdev(struct file_system_type *fs_type,
25732590
int flags, const char *dev_name, void *data,
25742591
int (*fill_super)(struct super_block *, void *, int));
@@ -3341,6 +3358,7 @@ extern void page_put_link(void *);
33413358
extern int page_symlink(struct inode *inode, const char *symname, int len);
33423359
extern const struct inode_operations page_symlink_inode_operations;
33433360
extern void kfree_link(void *);
3361+
void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode);
33443362
void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
33453363
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
33463364
void generic_fill_statx_atomic_writes(struct kstat *stat,

0 commit comments

Comments
 (0)