Skip to content

Commit c2b1f1b

Browse files
harishchegondiashutoshx
authored andcommitted
drm/xe/eustall: Resolve a possible circular locking dependency
Use a separate lock in the polling function eu_stall_data_buf_poll() instead of eu_stall->stream_lock. This would prevent a possible circular locking dependency leading to a deadlock as described below. This would also require additional locking with the new lock in the read function. <4> [787.192986] ====================================================== <4> [787.192988] WARNING: possible circular locking dependency detected <4> [787.192991] 6.14.0-rc7-xe+ #1 Tainted: G U <4> [787.192993] ------------------------------------------------------ <4> [787.192994] xe_eu_stall/20093 is trying to acquire lock: <4> [787.192996] ffff88819847e2c0 ((work_completion) (&(&stream->buf_poll_work)->work)), at: __flush_work+0x1f8/0x5e0 <4> [787.193005] but task is already holding lock: <4> [787.193007] ffff88814ce83ba8 (&gt->eu_stall->stream_lock){3:3}, at: xe_eu_stall_stream_ioctl+0x41/0x6a0 [xe] <4> [787.193090] which lock already depends on the new lock. <4> [787.193093] the existing dependency chain (in reverse order) is: <4> [787.193095] -> #1 (&gt->eu_stall->stream_lock){+.+.}-{3:3}: <4> [787.193099] __mutex_lock+0xb4/0xe40 <4> [787.193104] mutex_lock_nested+0x1b/0x30 <4> [787.193106] eu_stall_data_buf_poll_work_fn+0x44/0x1d0 [xe] <4> [787.193155] process_one_work+0x21c/0x740 <4> [787.193159] worker_thread+0x1db/0x3c0 <4> [787.193161] kthread+0x10d/0x270 <4> [787.193164] ret_from_fork+0x44/0x70 <4> [787.193168] ret_from_fork_asm+0x1a/0x30 <4> [787.193172] -> #0 ((work_completion)(&(&stream->buf_poll_work)->work)){+.+.}-{0:0}: <4> [787.193176] __lock_acquire+0x1637/0x2810 <4> [787.193180] lock_acquire+0xc9/0x300 <4> [787.193183] __flush_work+0x219/0x5e0 <4> [787.193186] cancel_delayed_work_sync+0x87/0x90 <4> [787.193189] xe_eu_stall_disable_locked+0x9a/0x260 [xe] <4> [787.193237] xe_eu_stall_stream_ioctl+0x5b/0x6a0 [xe] <4> [787.193285] __x64_sys_ioctl+0xa4/0xe0 <4> [787.193289] x64_sys_call+0x131e/0x2650 <4> [787.193292] do_syscall_64+0x91/0x180 <4> [787.193295] entry_SYSCALL_64_after_hwframe+0x76/0x7e <4> [787.193299] other info that might help us debug this: <4> [787.193302] Possible unsafe locking scenario: <4> [787.193304] CPU0 CPU1 <4> [787.193305] ---- ---- <4> [787.193306] lock(&gt->eu_stall->stream_lock); <4> [787.193308] lock((work_completion) (&(&stream->buf_poll_work)->work)); <4> [787.193311] lock(&gt->eu_stall->stream_lock); <4> [787.193313] lock((work_completion) (&(&stream->buf_poll_work)->work)); <4> [787.193315] *** DEADLOCK *** Fixes: 760edec ("drm/xe/eustall: Add support to read() and poll() EU stall data") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4598 Signed-off-by: Harish Chegondi <[email protected]> Reviewed-by: Ashutosh Dixit <[email protected]> Signed-off-by: Ashutosh Dixit <[email protected]> Link: https://lore.kernel.org/r/c896932fca84f79db2df5942911997ed77b2b9b6.1744934656.git.harish.chegondi@intel.com
1 parent 238ae3b commit c2b1f1b

File tree

1 file changed

+9
-2
lines changed

1 file changed

+9
-2
lines changed

drivers/gpu/drm/xe/xe_eu_stall.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ struct xe_eu_stall_data_stream {
5252

5353
struct xe_gt *gt;
5454
struct xe_bo *bo;
55+
/* Lock to protect data buffer pointers */
56+
struct mutex xecore_buf_lock;
5557
struct per_xecore_buf *xecore_buf;
5658
struct {
5759
bool reported_to_user;
@@ -378,7 +380,7 @@ static bool eu_stall_data_buf_poll(struct xe_eu_stall_data_stream *stream)
378380
u16 group, instance;
379381
unsigned int xecore;
380382

381-
mutex_lock(&gt->eu_stall->stream_lock);
383+
mutex_lock(&stream->xecore_buf_lock);
382384
for_each_dss_steering(xecore, gt, group, instance) {
383385
xecore_buf = &stream->xecore_buf[xecore];
384386
read_ptr = xecore_buf->read;
@@ -396,7 +398,7 @@ static bool eu_stall_data_buf_poll(struct xe_eu_stall_data_stream *stream)
396398
set_bit(xecore, stream->data_drop.mask);
397399
xecore_buf->write = write_ptr;
398400
}
399-
mutex_unlock(&gt->eu_stall->stream_lock);
401+
mutex_unlock(&stream->xecore_buf_lock);
400402

401403
return min_data_present;
402404
}
@@ -511,11 +513,13 @@ static ssize_t xe_eu_stall_stream_read_locked(struct xe_eu_stall_data_stream *st
511513
unsigned int xecore;
512514
int ret = 0;
513515

516+
mutex_lock(&stream->xecore_buf_lock);
514517
if (bitmap_weight(stream->data_drop.mask, XE_MAX_DSS_FUSE_BITS)) {
515518
if (!stream->data_drop.reported_to_user) {
516519
stream->data_drop.reported_to_user = true;
517520
xe_gt_dbg(gt, "EU stall data dropped in XeCores: %*pb\n",
518521
XE_MAX_DSS_FUSE_BITS, stream->data_drop.mask);
522+
mutex_unlock(&stream->xecore_buf_lock);
519523
return -EIO;
520524
}
521525
stream->data_drop.reported_to_user = false;
@@ -527,6 +531,7 @@ static ssize_t xe_eu_stall_stream_read_locked(struct xe_eu_stall_data_stream *st
527531
if (ret || count == total_size)
528532
break;
529533
}
534+
mutex_unlock(&stream->xecore_buf_lock);
530535
return total_size ?: (ret ?: -EAGAIN);
531536
}
532537

@@ -583,6 +588,7 @@ static void xe_eu_stall_stream_free(struct xe_eu_stall_data_stream *stream)
583588
{
584589
struct xe_gt *gt = stream->gt;
585590

591+
mutex_destroy(&stream->xecore_buf_lock);
586592
gt->eu_stall->stream = NULL;
587593
kfree(stream);
588594
}
@@ -718,6 +724,7 @@ static int xe_eu_stall_stream_init(struct xe_eu_stall_data_stream *stream,
718724
}
719725

720726
init_waitqueue_head(&stream->poll_wq);
727+
mutex_init(&stream->xecore_buf_lock);
721728
INIT_DELAYED_WORK(&stream->buf_poll_work, eu_stall_data_buf_poll_work_fn);
722729
stream->per_xecore_buf_size = per_xecore_buf_size;
723730
stream->sampling_rate_mult = props->sampling_rate_mult;

0 commit comments

Comments
 (0)