Skip to content

Commit a1e166b

Browse files
committed
feat(debugger): add special handling for very large collections/objects
Handle edge cases where an object has a lot of properties, or where collections (arrays/sets/maps) has a lot of elements. In those cases, a single Chrome DevTools Protocol `Debugger.getProperties` request will take a long time to serialize their properties/elements, which can siginificantly overshoot the time budget (as ongoing CDP requests are allowed to complete before aborting) This is dealt with in the following way: - Skip enumerating large collections (arrays/maps/sets) by parsing size from descriptions; mark with notCapturedReason: "collectionSize" and record size. - Large objects are still collected the first time, but objects with >500 properties set a fatalSnapshotError to prevent future snapshot attempts for that probe and emits error to the diagnostics endpoint.
1 parent 2dd3573 commit a1e166b

File tree

16 files changed

+386
-121
lines changed

16 files changed

+386
-121
lines changed

integration-tests/debugger/snapshot-time-budget.spec.js

Lines changed: 173 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,14 @@
22

33
const assert = require('node:assert')
44
const { setup } = require('./utils')
5+
const {
6+
DEFAULT_MAX_COLLECTION_SIZE,
7+
LARGE_OBJECT_SKIP_THRESHOLD
8+
} = require('../../packages/dd-trace/src/debugger/devtools_client/snapshot/constants')
59

610
describe('Dynamic Instrumentation', function () {
711
describe('input messages', function () {
8-
describe('with snapshot under tight time budget', function () {
12+
describe('with snapshot under time budget', function () {
913
context('1ms time budget', function () {
1014
// Force a very small time budget in ms to trigger partial snapshots
1115
const budget = 1
@@ -17,57 +21,191 @@ describe('Dynamic Instrumentation', function () {
1721
it(
1822
'should include partial snapshot marked with notCapturedReason: timeout',
1923
// A tolerance of 15ms is used to avoid flakiness
20-
test({ t, maxPausedTime: budget + 15, breakpointIndex: 0, maxReferenceDepth: 5 })
24+
test({ t, maxPausedTime: budget + 15, breakpointIndex: 0, maxReferenceDepth: 5 }, (locals) => {
25+
assert.strictEqual(
26+
containsTimeBudget(locals),
27+
true,
28+
'expected at least one field/element to be marked with notCapturedReason: "timeout"'
29+
)
30+
})
2131
)
2232
})
2333

2434
context('default time budget', function () {
25-
const budget = 15 // default time budget in ms
2635
const t = setup({ dependencies: ['fastify'] })
2736

28-
// TODO: Make this pass
29-
// eslint-disable-next-line mocha/no-pending-tests
30-
it.skip(
31-
'should keep budget when state includes an object with 1 million properties',
32-
// A tolerance of 5ms is used to avoid flakiness
33-
test({ t, maxPausedTime: budget + 5, breakpointIndex: 1, maxReferenceDepth: 1 })
37+
it(
38+
'should timeout first, then disable subsequent snapshots and emit error diagnostics',
39+
async function () {
40+
const breakpoint = t.breakpoints[1]
41+
42+
// Listen for the first snapshot payload (should contain notCapturedReason: "timeout")
43+
const firstPayloadReceived = new Promise((resolve) => {
44+
t.agent.once('debugger-input', ({ payload: [{ debugger: { snapshot: { captures } } }] }) => {
45+
const { locals } = captures.lines[breakpoint.line]
46+
resolve(locals)
47+
})
48+
})
49+
50+
// Prepare to assert that an ERROR diagnostics event with exception details is emitted
51+
const errorDiagnosticsReceived = new Promise((/** @type {(value?: unknown) => void} */ resolve, reject) => {
52+
const handler = ({ payload }) => {
53+
payload.forEach(({ debugger: { diagnostics } }) => {
54+
if (diagnostics.status !== 'ERROR') return
55+
try {
56+
assert.strictEqual(
57+
diagnostics.exception.message,
58+
'An object with 1000000 properties was detected while collecting a snapshot. This exceeds the ' +
59+
'maximum number of allowed properties of 500. Future snapshots for existing probes in this ' +
60+
'location will be skipped until the Node.js process is restarted'
61+
)
62+
resolve()
63+
} catch (e) {
64+
reject(e)
65+
} finally {
66+
t.agent.off('debugger-diagnostics', handler)
67+
}
68+
})
69+
}
70+
t.agent.on('debugger-diagnostics', handler)
71+
})
72+
73+
// Install probe with snapshot capture enabled
74+
t.agent.addRemoteConfig(breakpoint.generateRemoteConfig({
75+
captureSnapshot: true,
76+
capture: { maxReferenceDepth: 1 }
77+
}))
78+
79+
// Trigger once; this run is expected to be slow and mark fields with "timeout"
80+
const result1 = await breakpoint.triggerBreakpoint()
81+
assert.ok(
82+
result1.data.paused >= 1_000,
83+
`expected thread to be paused for at least 1 second, but was paused for ~${result1.data.paused}ms`
84+
)
85+
const locals = await firstPayloadReceived
86+
assert.strictEqual(
87+
containsTimeBudget(locals),
88+
true,
89+
'expected at least one field/element to be marked with notCapturedReason: "timeout"'
90+
)
91+
await errorDiagnosticsReceived
92+
93+
// Prepare to assert that no snapshot is produced on a subsequent trigger
94+
const noSnapshotAfterSecondTrigger = new Promise((/** @type {(value?: unknown) => void} */ resolve) => {
95+
t.agent.once('debugger-input', ({ payload: [{ debugger: { snapshot: { captures } } }] }) => {
96+
assert.strictEqual(captures, undefined)
97+
resolve()
98+
})
99+
})
100+
101+
// Trigger the same breakpoint again directly
102+
const result2 = await t.axios.get(breakpoint.url)
103+
assert.ok(
104+
result2.data.paused <= 5,
105+
`expected thread to be paused <=5ms, but was paused for ~${result2.data.paused}ms`
106+
)
107+
108+
await noSnapshotAfterSecondTrigger
109+
}
34110
)
111+
})
112+
113+
context('large object thresholds', function () {
114+
// The tests in this group, should take a lot longer than 100ms to capture a snapshot if no thresholds are
115+
// applied. But they should take a lot less than the 100ms budget if thresholds are applied. Using 100ms means
116+
// that the tests should not be flaky, but still fail if the thresholds are not applied.
117+
const budget = 100
118+
const t = setup({
119+
dependencies: ['fastify'],
120+
env: { DD_DYNAMIC_INSTRUMENTATION_CAPTURE_TIMEOUT_MS: String(budget) }
121+
})
35122

36-
// TODO: Make this pass
37-
// eslint-disable-next-line mocha/no-pending-tests
38-
it.skip(
39-
'should keep budget when state includes an array of 1 million primitives',
40-
// A tolerance of 5ms is used to avoid flakiness
41-
test({ t, maxPausedTime: budget + 5, breakpointIndex: 2, maxReferenceDepth: 1 })
123+
it(
124+
'should keep budget when state includes collections with 1 million elements',
125+
test({ t, maxPausedTime: budget, breakpointIndex: 2, maxReferenceDepth: 1 }, (locals) => {
126+
const notCapturedReason = `Large collection with too many elements (skip threshold: ${
127+
LARGE_OBJECT_SKIP_THRESHOLD
128+
})`
129+
assert.strictEqual(locals.arrOfPrimitives.notCapturedReason, notCapturedReason)
130+
assert.strictEqual(locals.arrOfPrimitives.size, 1_000_000)
131+
assert.strictEqual(locals.arrOfPrimitives.elements.length, 0)
132+
assert.strictEqual(locals.arrOfObjects.notCapturedReason, notCapturedReason)
133+
assert.strictEqual(locals.arrOfObjects.size, 1_000_000)
134+
assert.strictEqual(locals.arrOfObjects.elements.length, 0)
135+
assert.strictEqual(locals.map.notCapturedReason, notCapturedReason)
136+
assert.strictEqual(locals.map.size, 1_000_000)
137+
assert.strictEqual(locals.map.entries.length, 0)
138+
assert.strictEqual(locals.set.notCapturedReason, notCapturedReason)
139+
assert.strictEqual(locals.set.size, 1_000_000)
140+
assert.strictEqual(locals.set.elements.length, 0)
141+
})
42142
)
43143

44-
// TODO: Make this pass
45-
// eslint-disable-next-line mocha/no-pending-tests
46-
it.skip(
47-
'should keep budget when state includes an array of 1 million objects',
48-
// A tolerance of 5ms is used to avoid flakiness
49-
test({ t, maxPausedTime: budget + 5, breakpointIndex: 3, maxReferenceDepth: 1 })
144+
it(
145+
'should keep budget when state includes collections with less than the size threshold',
146+
test({ t, maxPausedTime: budget, breakpointIndex: 3, maxReferenceDepth: 1 }, (locals) => {
147+
assert.strictEqual(locals.arrOfPrimitives.notCapturedReason, 'collectionSize')
148+
assert.strictEqual(locals.arrOfPrimitives.size, LARGE_OBJECT_SKIP_THRESHOLD - 1)
149+
assert.strictEqual(locals.arrOfPrimitives.elements.length, DEFAULT_MAX_COLLECTION_SIZE)
150+
assert.strictEqual(locals.arrOfObjects.notCapturedReason, 'collectionSize')
151+
assert.strictEqual(locals.arrOfObjects.size, LARGE_OBJECT_SKIP_THRESHOLD - 1)
152+
assert.strictEqual(locals.arrOfObjects.elements.length, DEFAULT_MAX_COLLECTION_SIZE)
153+
assert.strictEqual(locals.map.notCapturedReason, 'collectionSize')
154+
assert.strictEqual(locals.map.size, LARGE_OBJECT_SKIP_THRESHOLD - 1)
155+
assert.strictEqual(locals.map.entries.length, DEFAULT_MAX_COLLECTION_SIZE)
156+
assert.strictEqual(locals.set.notCapturedReason, 'collectionSize')
157+
assert.strictEqual(locals.set.size, LARGE_OBJECT_SKIP_THRESHOLD - 1)
158+
assert.strictEqual(locals.set.elements.length, DEFAULT_MAX_COLLECTION_SIZE)
159+
})
50160
)
51161
})
162+
163+
context('fuzzing', function () {
164+
for (let budget = 0; budget < 20; budget++) {
165+
context(`graceful handling with time budget of ${budget}ms`, function () {
166+
// Anything longer than this, and the debugger worker thread most likely crashed.
167+
// Run test with `DD_TRACE_DEBUG=true` to see more.
168+
this.timeout(2000)
169+
170+
const t = setup({
171+
dependencies: ['fastify'],
172+
env: { DD_DYNAMIC_INSTRUMENTATION_CAPTURE_TIMEOUT_MS: String(budget) }
173+
})
174+
175+
// If this test uncovers any issues, it will show itself as being flaky, as the exact timing of how long it
176+
// takes to collect the snapshot is not deterministic.
177+
it('should send a probe result to the agent', async function () {
178+
t.agent.addRemoteConfig(t.generateRemoteConfig({
179+
captureSnapshot: true,
180+
capture: { maxReferenceDepth: 5 }
181+
}))
182+
t.triggerBreakpoint()
183+
184+
const { captures } = await t.snapshotReceived()
185+
// If the snapshot contains a `captures` field, it means it was captured and processes without any issues
186+
assert.ok(
187+
captures !== null && typeof captures === 'object',
188+
'expected snapshot to contain a `captures` object'
189+
)
190+
// To make this test more future-proof, we also assert that the snapshot contains at least one local
191+
// property, though currently this is not necessary.
192+
assert.ok(
193+
Object.keys(captures.lines[t.breakpoint.line].locals).length > 0,
194+
'expected snapshot to contain at least one local property'
195+
)
196+
})
197+
})
198+
}
199+
})
52200
})
53201
})
54202
})
55203

56-
function test ({ t, maxPausedTime, breakpointIndex, maxReferenceDepth }) {
204+
function test ({ t, maxPausedTime = 0, breakpointIndex, maxReferenceDepth }, assertFn) {
57205
const breakpoint = t.breakpoints[breakpointIndex]
58206

59207
return async function () {
60-
const payloadReceived = new Promise((/** @type {(value?: unknown) => void} */ resolve) => {
61-
t.agent.on('debugger-input', ({ payload: [{ debugger: { snapshot: { captures } } }] }) => {
62-
const { locals } = captures.lines[breakpoint.line]
63-
assert.strictEqual(
64-
containsTimeBudget(locals),
65-
true,
66-
'expected at least one field/element to be marked with notCapturedReason: "timeout"'
67-
)
68-
resolve()
69-
})
70-
})
208+
const snapshotPromise = t.snapshotReceived()
71209

72210
t.agent.addRemoteConfig(breakpoint.generateRemoteConfig({
73211
captureSnapshot: true,
@@ -81,7 +219,8 @@ function test ({ t, maxPausedTime, breakpointIndex, maxReferenceDepth }) {
81219
`expected thread to be paused <=${maxPausedTime}ms, but was paused for ~${data.paused}ms`
82220
)
83221

84-
await payloadReceived
222+
const snapshot = await snapshotPromise
223+
assertFn?.(snapshot.captures.lines[breakpoint.line].locals)
85224
}
86225
}
87226

integration-tests/debugger/target-app/snapshot-time-budget.js

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
require('dd-trace/init')
44

55
const Fastify = require('fastify')
6+
const {
7+
LARGE_OBJECT_SKIP_THRESHOLD
8+
} = require('dd-trace/packages/dd-trace/src/debugger/devtools_client/snapshot/constants')
69

710
const fastify = Fastify({ logger: { level: 'error' } })
811

@@ -16,24 +19,31 @@ fastify.get('/deeply-nested-large-object', function handler () {
1619
return { paused: Number(diff) / 1_000_000 }
1720
})
1821

19-
fastify.get('/large-object-of-primitives', function handler () {
22+
fastify.get('/object-with-many-properties', function handler () {
2023
const obj = generateObject(0, 1_000_000) // eslint-disable-line no-unused-vars
2124
const start = process.hrtime.bigint()
22-
const diff = process.hrtime.bigint() - start // BREAKPOINT: /large-object-of-primitives
25+
const diff = process.hrtime.bigint() - start // BREAKPOINT: /object-with-many-properties
2326
return { paused: Number(diff) / 1_000_000 }
2427
})
2528

26-
fastify.get('/large-array-of-primitives', function handler () {
27-
const arr = Array.from({ length: 1_000_000 }, (_, i) => i) // eslint-disable-line no-unused-vars
29+
fastify.get('/large-collections', function handler () {
30+
const arrOfPrimitives = Array.from({ length: 1_000_000 }, (_, i) => i) // eslint-disable-line no-unused-vars
31+
const arrOfObjects = Array.from({ length: 1_000_000 }, (_, i) => ({ i }))
32+
const map = new Map(arrOfObjects.map((obj, i) => [i, obj])) // eslint-disable-line no-unused-vars
33+
const set = new Set(arrOfObjects) // eslint-disable-line no-unused-vars
2834
const start = process.hrtime.bigint()
29-
const diff = process.hrtime.bigint() - start // BREAKPOINT: /large-array-of-primitives
35+
const diff = process.hrtime.bigint() - start // BREAKPOINT: /large-collections
3036
return { paused: Number(diff) / 1_000_000 }
3137
})
3238

33-
fastify.get('/large-array-of-objects', function handler () {
34-
const arr = Array.from({ length: 1_000_000 }, (_, i) => ({ i })) // eslint-disable-line no-unused-vars
39+
fastify.get('/large-collections-below-size-threshold', function handler () {
40+
// eslint-disable-next-line no-unused-vars
41+
const arrOfPrimitives = Array.from({ length: LARGE_OBJECT_SKIP_THRESHOLD - 1 }, (_, i) => i)
42+
const arrOfObjects = Array.from({ length: LARGE_OBJECT_SKIP_THRESHOLD - 1 }, (_, i) => ({ i }))
43+
const map = new Map(arrOfObjects.map((obj, i) => [i, obj])) // eslint-disable-line no-unused-vars
44+
const set = new Set(arrOfObjects) // eslint-disable-line no-unused-vars
3545
const start = process.hrtime.bigint()
36-
const diff = process.hrtime.bigint() - start // BREAKPOINT: /large-array-of-objects
46+
const diff = process.hrtime.bigint() - start // BREAKPOINT: /large-collections-below-size-threshold
3747
return { paused: Number(diff) / 1_000_000 }
3848
})
3949

integration-tests/debugger/utils.js

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,15 @@ function setup ({ env, testApp, testAppSource, dependencies, silent, stdioHandle
3838
rcConfig: null,
3939
triggerBreakpoint: triggerBreakpoint.bind(null, breakpoints[0].url),
4040
generateRemoteConfig: generateRemoteConfig.bind(null, breakpoints[0]),
41-
generateProbeConfig: generateProbeConfig.bind(null, breakpoints[0])
41+
generateProbeConfig: generateProbeConfig.bind(null, breakpoints[0]),
42+
43+
snapshotReceived () {
44+
return new Promise((/** @type {(value?: object) => void} */ resolve) => {
45+
t.agent.on('debugger-input', ({ payload: [{ debugger: { snapshot } }] }) => {
46+
resolve(snapshot)
47+
})
48+
})
49+
}
4250
}
4351

4452
// Allow specific access to each breakpoint

packages/dd-trace/src/ci-visibility/dynamic-instrumentation/worker/index.js

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ session.on('Debugger.paused', async ({ params: { hitBreakpoints: [hitBreakpoint]
3636

3737
const stack = getStackFromCallFrames(callFrames)
3838

39-
const getLocalState = await getLocalStateForCallFrame(callFrames[0])
39+
const { processLocalState } = await getLocalStateForCallFrame(callFrames[0])
4040

4141
await session.post('Debugger.resume')
4242

@@ -48,17 +48,13 @@ session.on('Debugger.paused', async ({ params: { hitBreakpoints: [hitBreakpoint]
4848
version: '0',
4949
location: probe.location
5050
},
51+
captures: {
52+
lines: { [probe.location.lines[0]]: { locals: processLocalState() } }
53+
},
5154
stack,
5255
language: 'javascript'
5356
}
5457

55-
const state = getLocalState()
56-
if (state) {
57-
snapshot.captures = {
58-
lines: { [probe.location.lines[0]]: { locals: state } }
59-
}
60-
}
61-
6258
breakpointHitChannel.postMessage({ snapshot })
6359
})
6460

0 commit comments

Comments
 (0)