Skip to content

Commit b85c564

Browse files
robnbehlendorf
authored andcommitted
libspl/backtrace: comment and harden libunwind backtracer
This is the sort of code that we get right once and never look at again. Anyone reading this code is already likely in the middle of a debugging nightmare, and then they have a wall of manual string construction and an unfamiliar and idiosyncratic library to deal with. So, comment the whole thing to try to make it clear what's going on. In pursuit of the above, I've added return checks to some of the libunwind calls, fixed the frame loop to not skip the "top" frame (however useless it may be), and fixed a couple of calls to spl_bt_u64_to_hex_str() which requested 18 digits instead of 16. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Tino Reichardt <[email protected]> Signed-off-by: Rob Norris <[email protected]> Closes #16653
1 parent 2596a75 commit b85c564

File tree

1 file changed

+141
-25
lines changed

1 file changed

+141
-25
lines changed

lib/libspl/backtrace.c

Lines changed: 141 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -79,61 +79,177 @@ libspl_backtrace(int fd)
7979
unw_cursor_t cp;
8080
unw_word_t v;
8181
char buf[128];
82-
size_t n, c;
82+
size_t n;
83+
int err;
8384

85+
/* Snapshot the current frame and state. */
8486
unw_getcontext(&uc);
8587

86-
unw_init_local(&cp, &uc);
88+
/*
89+
* TODO: walk back to the frame that tripped the assertion / the place
90+
* where the signal was received.
91+
*/
92+
93+
/*
94+
* Register dump. We're going to loop over all the registers in the
95+
* top frame, and show them, with names, in a nice three-column
96+
* layout, which keeps us within 80 columns.
97+
*/
8798
spl_bt_write(fd, "Registers:\n");
88-
c = 0;
99+
100+
/* Initialise a frame cursor, starting at the current frame */
101+
unw_init_local(&cp, &uc);
102+
103+
/*
104+
* libunwind's list of possible registers for this architecture is an
105+
* enum, unw_regnum_t. UNW_TDEP_LAST_REG is the highest-numbered
106+
* register in that list, however, not all register numbers in this
107+
* range are defined by the architecture, and not all defined registers
108+
* will be present on every implementation of that architecture.
109+
* Moreover, libunwind provides nice names for most, but not all
110+
* registers, but these are hardcoded; a name being available does not
111+
* mean that register is available.
112+
*
113+
* So, we have to pull this all together here. We try to get the value
114+
* of every possible register. If we get a value for it, then the
115+
* register must exist, and so we get its name. If libunwind has no
116+
* name for it, we synthesize something. These cases should be rare,
117+
* and they're usually for uninteresting or niche registers, so it
118+
* shouldn't really matter. We can see the value, and that's the main
119+
* thing.
120+
*/
121+
uint_t cols = 0;
89122
for (uint_t regnum = 0; regnum <= UNW_TDEP_LAST_REG; regnum++) {
123+
/*
124+
* Get the value. Any error probably means the register
125+
* doesn't exist, and we skip it.
126+
*/
90127
if (unw_get_reg(&cp, regnum, &v) < 0)
91128
continue;
129+
130+
/*
131+
* Register name. If libunwind doesn't have a name for it,
132+
* it will return "???". As a shortcut, we just treat '?'
133+
* as an alternate end-of-string character.
134+
*/
92135
const char *name = unw_regname(regnum);
93136
for (n = 0; name[n] != '\0' && name[n] != '?'; n++) {}
94137
if (n == 0) {
138+
/*
139+
* No valid name, so make one of the form "?xx", where
140+
* "xx" is the two-char hex of libunwind's register
141+
* number.
142+
*/
95143
buf[0] = '?';
96144
n = spl_bt_u64_to_hex_str(regnum, 2,
97145
&buf[1], sizeof (buf)-1) + 1;
98146
name = buf;
99147
}
148+
149+
/*
150+
* Two spaces of padding before each column, plus extra
151+
* spaces to align register names shorter than three chars.
152+
*/
100153
spl_bt_write_n(fd, " ", 5-MIN(n, 3));
154+
155+
/* Register name and column punctuation */
101156
spl_bt_write_n(fd, name, n);
102157
spl_bt_write(fd, ": 0x");
103-
n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf));
158+
159+
/*
160+
* Convert register value (from unw_get_reg()) to hex. We're
161+
* assuming that all registers are 64-bits wide, which is
162+
* probably fine for any general-purpose registers on any
163+
* machine currently in use. A more generic way would be to
164+
* look at the width of unw_word_t, but that would also
165+
* complicate the column code a bit. This is fine.
166+
*/
167+
n = spl_bt_u64_to_hex_str(v, 16, buf, sizeof (buf));
104168
spl_bt_write_n(fd, buf, n);
105-
if (!(++c % 3))
169+
170+
/* Every third column, emit a newline */
171+
if (!(++cols % 3))
106172
spl_bt_write(fd, "\n");
107173
}
108-
if (c % 3)
174+
175+
/* If we finished before the third column, emit a newline. */
176+
if (cols % 3)
109177
spl_bt_write(fd, "\n");
110178

111-
unw_init_local(&cp, &uc);
179+
/* Now the main event, the backtrace. */
112180
spl_bt_write(fd, "Call trace:\n");
113-
while (unw_step(&cp) > 0) {
114-
unw_get_reg(&cp, UNW_REG_IP, &v);
181+
182+
/* Reset the cursor to the top again. */
183+
unw_init_local(&cp, &uc);
184+
185+
do {
186+
/*
187+
* Getting the IP should never fail; libunwind handles it
188+
* specially, because it's used a lot internally. Still, no
189+
* point being silly about it, as the last thing we want is
190+
* our crash handler to crash. So if it ever does fail, we'll
191+
* show an error line, but keep going to the next frame.
192+
*/
193+
if (unw_get_reg(&cp, UNW_REG_IP, &v) < 0) {
194+
spl_bt_write(fd, " [couldn't get IP register; "
195+
"corrupt frame?]");
196+
continue;
197+
}
198+
199+
/* IP & punctuation */
200+
n = spl_bt_u64_to_hex_str(v, 16, buf, sizeof (buf));
115201
spl_bt_write(fd, " [0x");
116-
n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf));
117202
spl_bt_write_n(fd, buf, n);
118203
spl_bt_write(fd, "] ");
119-
unw_get_proc_name(&cp, buf, sizeof (buf), &v);
120-
for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
121-
spl_bt_write_n(fd, buf, n);
122-
spl_bt_write(fd, "+0x");
123-
n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf));
124-
spl_bt_write_n(fd, buf, n);
204+
205+
/*
206+
* Function ("procedure") name for the current frame. `v`
207+
* receives the offset from the named function to the IP, which
208+
* we show as a "+offset" suffix.
209+
*
210+
* If libunwind can't determine the name, we just show "???"
211+
* instead. We've already displayed the IP above; that will
212+
* have to do.
213+
*
214+
* unw_get_proc_name() will return ENOMEM if the buffer is too
215+
* small, and truncates the name to fit. So we treat that as a
216+
* success and use whatever is in the buffer.
217+
*/
218+
err = unw_get_proc_name(&cp, buf, sizeof (buf), &v);
219+
if (err == 0 || err == -UNW_ENOMEM) {
220+
for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
221+
spl_bt_write_n(fd, buf, n);
222+
223+
/* Offset from proc name */
224+
spl_bt_write(fd, "+0x");
225+
n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf));
226+
spl_bt_write_n(fd, buf, n);
227+
} else
228+
spl_bt_write(fd, "???");
229+
125230
#ifdef HAVE_LIBUNWIND_ELF
126-
spl_bt_write(fd, " (in ");
127-
unw_get_elf_filename(&cp, buf, sizeof (buf), &v);
128-
for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
129-
spl_bt_write_n(fd, buf, n);
130-
spl_bt_write(fd, " +0x");
131-
n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf));
132-
spl_bt_write_n(fd, buf, n);
133-
spl_bt_write(fd, ")");
231+
/*
232+
* Newer libunwind has unw_get_elf_filename(), which gets
233+
* the name of the ELF object that the frame was executing in.
234+
* Like `unw_get_proc_name()`, `v` receives the offset within
235+
* the file, and UNW_ENOMEM indicates that a truncated filename
236+
* was left in the buffer.
237+
*/
238+
err = unw_get_elf_filename(&cp, buf, sizeof (buf), &v);
239+
if (err == 0 || err == -UNW_ENOMEM) {
240+
for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
241+
spl_bt_write(fd, " (in ");
242+
spl_bt_write_n(fd, buf, n);
243+
244+
/* Offset within file */
245+
spl_bt_write(fd, " +0x");
246+
n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf));
247+
spl_bt_write_n(fd, buf, n);
248+
spl_bt_write(fd, ")");
249+
}
134250
#endif
135251
spl_bt_write(fd, "\n");
136-
}
252+
} while (unw_step(&cp) > 0);
137253
}
138254
#elif defined(HAVE_BACKTRACE)
139255
#include <execinfo.h>

0 commit comments

Comments
 (0)