Skip to content

Commit 2b319c4

Browse files
committed
Fix snapshot automount race causing duplicate mounts and AVL tree panic
Multiple threads racing to automount the same snapshot can both spawn mount helper processes that successfully complete, causing both parent threads to attempt AVL tree registration and triggering a VERIFY() panic in avl_add(). This occurs because the fsconfig/fsmount API lacks the serialization provided by traditional mount() via lock_mount(). The fix adds a per-entry mutex (se_mtx) to zfs_snapentry_t that serializes mount and unmount operations on the same snapshot. The first mount thread creates a pending entry with se_spa=NULL and holds se_mtx during the helper execution. Concurrent mounts find the pending entry and return success without spawning duplicate helpers. Unmount waits on se_mtx if a mount is pending, ensuring proper serialization. This allows different snapshots to mount in parallel while preventing the AVL panic. Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
1 parent 7f7d493 commit 2b319c4

File tree

1 file changed

+87
-37
lines changed

1 file changed

+87
-37
lines changed

module/os/linux/zfs/zfs_ctldir.c

Lines changed: 87 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -117,14 +117,15 @@ static int zfs_snapshot_no_setuid = 0;
117117
typedef struct {
118118
char *se_name; /* full snapshot name */
119119
char *se_path; /* full mount path */
120-
spa_t *se_spa; /* pool spa */
120+
spa_t *se_spa; /* pool spa (NULL if pending) */
121121
uint64_t se_objsetid; /* snapshot objset id */
122122
struct dentry *se_root_dentry; /* snapshot root dentry */
123123
krwlock_t se_taskqid_lock; /* scheduled unmount taskqid lock */
124124
taskqid_t se_taskqid; /* scheduled unmount taskqid */
125125
avl_node_t se_node_name; /* zfs_snapshots_by_name link */
126126
avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */
127127
zfs_refcount_t se_refcount; /* reference count */
128+
kmutex_t se_mtx; /* serialize mount/unmount/rename */
128129
} zfs_snapentry_t;
129130

130131
static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);
@@ -148,6 +149,7 @@ zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa,
148149
se->se_root_dentry = root_dentry;
149150
se->se_taskqid = TASKQID_INVALID;
150151
rw_init(&se->se_taskqid_lock, NULL, RW_DEFAULT, NULL);
152+
mutex_init(&se->se_mtx, NULL, MUTEX_DEFAULT, NULL);
151153

152154
zfs_refcount_create(&se->se_refcount);
153155

@@ -165,6 +167,7 @@ zfsctl_snapshot_free(zfs_snapentry_t *se)
165167
kmem_strfree(se->se_name);
166168
kmem_strfree(se->se_path);
167169
rw_destroy(&se->se_taskqid_lock);
170+
mutex_destroy(&se->se_mtx);
168171

169172
kmem_free(se, sizeof (zfs_snapentry_t));
170173
}
@@ -190,34 +193,52 @@ zfsctl_snapshot_rele(zfs_snapentry_t *se)
190193
}
191194

192195
/*
193-
* Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
194-
* zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part
195-
* of the trees a reference is held.
196+
* Add a zfs_snapentry_t to the zfs_snapshots_by_name tree. If the entry
197+
* is not pending (se_spa != NULL), also add to zfs_snapshots_by_objsetid.
198+
* While the zfs_snapentry_t is part of the trees a reference is held.
196199
*/
197200
static void
198201
zfsctl_snapshot_add(zfs_snapentry_t *se)
199202
{
200203
ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
201204
zfsctl_snapshot_hold(se);
202205
avl_add(&zfs_snapshots_by_name, se);
203-
avl_add(&zfs_snapshots_by_objsetid, se);
206+
if (se->se_spa != NULL)
207+
avl_add(&zfs_snapshots_by_objsetid, se);
204208
}
205209

206210
/*
207-
* Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
208-
* zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped,
209-
* this can result in the structure being freed if that was the last
210-
* remaining reference.
211+
* Remove a zfs_snapentry_t from the zfs_snapshots_by_name tree and
212+
* zfs_snapshots_by_objsetid tree (if not pending). Upon removal a
213+
* reference is dropped, this can result in the structure being freed
214+
* if that was the last remaining reference.
211215
*/
212216
static void
213217
zfsctl_snapshot_remove(zfs_snapentry_t *se)
214218
{
215219
ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
216220
avl_remove(&zfs_snapshots_by_name, se);
217-
avl_remove(&zfs_snapshots_by_objsetid, se);
221+
if (se->se_spa != NULL)
222+
avl_remove(&zfs_snapshots_by_objsetid, se);
218223
zfsctl_snapshot_rele(se);
219224
}
220225

226+
/*
227+
* Fill a pending zfs_snapentry_t after mount succeeds. Fills in the
228+
* remaining fields and adds the entry to the zfs_snapshots_by_objsetid tree.
229+
*/
230+
static void
231+
zfsctl_snapshot_fill(zfs_snapentry_t *se, spa_t *spa, uint64_t objsetid,
232+
struct dentry *root_dentry)
233+
{
234+
ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
235+
ASSERT3P(se->se_spa, ==, NULL);
236+
se->se_spa = spa;
237+
se->se_objsetid = objsetid;
238+
se->se_root_dentry = root_dentry;
239+
avl_add(&zfs_snapshots_by_objsetid, se);
240+
}
241+
221242
/*
222243
* Snapshot name comparison function for the zfs_snapshots_by_name.
223244
*/
@@ -315,6 +336,11 @@ zfsctl_snapshot_rename(const char *old_snapname, const char *new_snapname)
315336
se = zfsctl_snapshot_find_by_name(old_snapname);
316337
if (se == NULL)
317338
return (SET_ERROR(ENOENT));
339+
if (se->se_spa == NULL) {
340+
/* Snapshot mount is in progress */
341+
zfsctl_snapshot_rele(se);
342+
return (SET_ERROR(EBUSY));
343+
}
318344

319345
zfsctl_snapshot_remove(se);
320346
kmem_strfree(se->se_name);
@@ -437,26 +463,6 @@ zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
437463
return (error);
438464
}
439465

440-
/*
441-
* Check if snapname is currently mounted. Returned non-zero when mounted
442-
* and zero when unmounted.
443-
*/
444-
static boolean_t
445-
zfsctl_snapshot_ismounted(const char *snapname)
446-
{
447-
zfs_snapentry_t *se;
448-
boolean_t ismounted = B_FALSE;
449-
450-
rw_enter(&zfs_snapshot_lock, RW_READER);
451-
if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
452-
zfsctl_snapshot_rele(se);
453-
ismounted = B_TRUE;
454-
}
455-
rw_exit(&zfs_snapshot_lock);
456-
457-
return (ismounted);
458-
}
459-
460466
/*
461467
* Check if the given inode is a part of the virtual .zfs directory.
462468
*/
@@ -1133,13 +1139,22 @@ zfsctl_snapshot_unmount(const char *snapname, int flags)
11331139
}
11341140
rw_exit(&zfs_snapshot_lock);
11351141

1142+
/*
1143+
* Hold se_mtx for the duration of unmount to serialize with
1144+
* any in-progress mount on this snapshot entry. If auto-mount
1145+
* is pending, this waits for it to complete.
1146+
*/
1147+
mutex_enter(&se->se_mtx);
1148+
11361149
exportfs_flush();
11371150

11381151
if (flags & MNT_FORCE)
11391152
argv[4] = "-fn";
11401153
argv[5] = se->se_path;
11411154
dprintf("unmount; path=%s\n", se->se_path);
11421155
error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
1156+
1157+
mutex_exit(&se->se_mtx);
11431158
zfsctl_snapshot_rele(se);
11441159

11451160

@@ -1234,14 +1249,31 @@ zfsctl_snapshot_mount(struct path *path, int flags)
12341249
zfs_snapshot_no_setuid ? "nosuid" : "suid");
12351250

12361251
/*
1237-
* Multiple concurrent automounts of a snapshot are never allowed.
1238-
* The snapshot may be manually mounted as many times as desired.
1252+
* Create a pending AVL entry for the snapshot to prevent concurrent
1253+
* automount attempts. If an entry already exists, the snapshot is
1254+
* either mounted or being mounted by another thread, so return
1255+
* success without starting a duplicate mount operation. returning
1256+
* EBUSY would mean that VFS would retry, which we do not want.
12391257
*/
1240-
if (zfsctl_snapshot_ismounted(full_name)) {
1258+
rw_enter(&zfs_snapshot_lock, RW_WRITER);
1259+
if ((se = zfsctl_snapshot_find_by_name(full_name)) != NULL) {
1260+
zfsctl_snapshot_rele(se);
1261+
rw_exit(&zfs_snapshot_lock);
12411262
error = 0;
12421263
goto error;
12431264
}
12441265

1266+
/*
1267+
* Create pending entry (se_spa=NULL), add to AVL tree, and acquire
1268+
* se_mtx to serialize with concurrent unmount operations. Release
1269+
* global lock before long-running mount helper.
1270+
*/
1271+
se = zfsctl_snapshot_alloc(full_name, full_path, NULL, 0, NULL);
1272+
zfsctl_snapshot_add(se);
1273+
zfsctl_snapshot_hold(se);
1274+
mutex_enter(&se->se_mtx);
1275+
rw_exit(&zfs_snapshot_lock);
1276+
12451277
/*
12461278
* Attempt to mount the snapshot from user space. Normally this
12471279
* would be done using the vfs_kern_mount() function, however that
@@ -1260,6 +1292,16 @@ zfsctl_snapshot_mount(struct path *path, int flags)
12601292
argv[9] = full_path;
12611293
error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
12621294
if (error) {
1295+
/*
1296+
* Mount failed - remove pending entry from AVL, release mutex,
1297+
* and drop mount operation reference.
1298+
*/
1299+
rw_enter(&zfs_snapshot_lock, RW_WRITER);
1300+
zfsctl_snapshot_remove(se);
1301+
rw_exit(&zfs_snapshot_lock);
1302+
mutex_exit(&se->se_mtx);
1303+
zfsctl_snapshot_rele(se);
1304+
12631305
if (!(error & MOUNT_BUSY << 8)) {
12641306
zfs_dbgmsg("Unable to automount %s error=%d",
12651307
full_path, error);
@@ -1291,14 +1333,22 @@ zfsctl_snapshot_mount(struct path *path, int flags)
12911333
spath.mnt->mnt_flags |= MNT_SHRINKABLE;
12921334

12931335
rw_enter(&zfs_snapshot_lock, RW_WRITER);
1294-
se = zfsctl_snapshot_alloc(full_name, full_path,
1295-
snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os),
1296-
dentry);
1297-
zfsctl_snapshot_add(se);
1336+
zfsctl_snapshot_fill(se, snap_zfsvfs->z_os->os_spa,
1337+
dmu_objset_id(snap_zfsvfs->z_os), dentry);
12981338
zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
12991339
rw_exit(&zfs_snapshot_lock);
1340+
} else {
1341+
rw_enter(&zfs_snapshot_lock, RW_WRITER);
1342+
zfsctl_snapshot_remove(se);
1343+
rw_exit(&zfs_snapshot_lock);
13001344
}
13011345
path_put(&spath);
1346+
1347+
/*
1348+
* Release mutex and drop mount operation reference.
1349+
*/
1350+
mutex_exit(&se->se_mtx);
1351+
zfsctl_snapshot_rele(se);
13021352
error:
13031353
kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN);
13041354
kmem_free(full_path, MAXPATHLEN);

0 commit comments

Comments
 (0)