
Commit f42bb33

feat(debugger): add special handling for very large collections/objects
Handle edge cases where an object has a very large number of properties, or where a collection (array/set/map) has a very large number of elements. In those cases, a single Chrome DevTools Protocol `Runtime.getProperties` request can take a long time to serialize the properties/elements, which can significantly overshoot the time budget (as in-flight CDP requests are allowed to complete before aborting). This is dealt with in the following way:

- Skip enumerating large collections (arrays/maps/sets) by parsing their size from the CDP description; mark them with notCapturedReason: "collectionSize" and record the size.
- Large objects are still collected the first time, but an object with more than 500 properties sets a fatalSnapshotError, which prevents future snapshot attempts for that probe and emits an error to the diagnostics endpoint.
1 parent cd88600 commit f42bb33
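
For orientation, the collection shortcut relies on V8 embedding the element count in the CDP RemoteObject description (typically strings such as "Array(1000000)", "Map(500)" or "Set(3)"). The following is a minimal, self-contained sketch of that idea; the threshold and the resulting snapshot shape mirror the diff below, but the helper names are illustrative, not the shipped code:

const COLLECTION_SIZE_THRESHOLD = 500

// Parse the element count out of a CDP description such as "Array(1000000)".
// Returns null when the description does not encode a size (e.g. a plain "Object").
function sizeFromDescription (description) {
  const match = /\((\d+)\)$/.exec(description ?? '')
  return match ? Number(match[1]) : null
}

// Oversized collections are not enumerated at all; only their size is recorded.
function captureCollection (description) {
  const size = sizeFromDescription(description)
  if (size !== null && size >= COLLECTION_SIZE_THRESHOLD) {
    return { notCapturedReason: 'collectionSize', size, elements: [] }
  }
  return null // small enough, fall through to normal element capture
}

console.log(captureCollection('Array(1000000)')) // { notCapturedReason: 'collectionSize', size: 1000000, elements: [] }
console.log(captureCollection('Map(499)'))       // null (below the threshold, elements are captured as usual)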

9 files changed, +203 -53 lines changed


integration-tests/debugger/snapshot-time-budget.spec.js

Lines changed: 119 additions & 23 deletions
@@ -3,7 +3,8 @@
 const assert = require('node:assert')
 const { setup } = require('./utils')

-const tolerance = 5
+const DEFAULT_MAX_COLLECTION_SIZE = 100
+const COLLECTION_SIZE_THRESHOLD = 500

 describe('Dynamic Instrumentation', function () {
   describe('input messages', function () {
@@ -18,51 +19,146 @@ describe('Dynamic Instrumentation', function () {

         it(
           'should include partial snapshot marked with notCapturedReason: timeout',
-          test({ t, maxPausedTime: target + tolerance, breakpointIndex: 0, maxReferenceDepth: 5 })
+          // A tolerance of 10ms is used to avoid flakiness
+          test({ t, maxPausedTime: target + 10, breakpointIndex: 0, maxReferenceDepth: 5 }, (locals) => {
+            assert.strictEqual(
+              containsTimeBudget(locals),
+              true,
+              'expected at least one field/element to be marked with notCapturedReason: "timeout"'
+            )
+          })
         )
       })

       context('default time budget', function () {
         const target = 10 // default time budget in ms
         const t = setup({ dependencies: ['fastify'] })

-        // TODO: Make this pass
-        // eslint-disable-next-line mocha/no-pending-tests
-        it.skip(
-          'should keep budget when state includes an object with 1 million properties',
-          test({ t, maxPausedTime: target + tolerance, breakpointIndex: 1, maxReferenceDepth: 1 })
+        it(
+          'should timeout first, then disable subsequent snapshots and emit error diagnostics',
+          async function () {
+            const breakpoint = t.breakpoints[1]
+
+            // Listen for the first snapshot payload (should contain notCapturedReason: "timeout")
+            const firstPayloadReceived = new Promise((resolve) => {
+              t.agent.once('debugger-input', ({ payload: [{ debugger: { snapshot: { captures } } }] }) => {
+                const { locals } = captures.lines[breakpoint.line]
+                resolve(locals)
+              })
+            })
+
+            // Prepare to assert that an ERROR diagnostics event with exception details is emitted
+            const errorDiagnosticsReceived = new Promise((/** @type {(value?: unknown) => void} */ resolve, reject) => {
+              const handler = ({ payload }) => {
+                payload.forEach(({ debugger: { diagnostics } }) => {
+                  if (diagnostics.status !== 'ERROR') return
+                  try {
+                    assert.strictEqual(
+                      diagnostics.exception.message,
+                      'An object with more than 500 properties was detected while collecting a snapshot. Future ' +
+                      'snapshots for exising probes in this location will be skipped until the Node.js process is ' +
+                      'restarted'
+                    )
+                    resolve()
+                  } catch (e) {
+                    reject(e)
+                  } finally {
+                    t.agent.off('debugger-diagnostics', handler)
+                  }
+                })
+              }
+              t.agent.on('debugger-diagnostics', handler)
+            })
+
+            // Install probe with snapshot capture enabled
+            t.agent.addRemoteConfig(breakpoint.generateRemoteConfig({
+              captureSnapshot: true,
+              capture: { maxReferenceDepth: 1 }
+            }))
+
+            // Trigger once; this run is expected to be slow and mark fields with "timeout"
+            const result1 = await breakpoint.triggerBreakpoint()
+            assert.ok(
+              result1.data.paused >= 1_000,
+              `expected thread to be paused for at least 1 second, but was paused for ~${result1.data.paused}ms`
+            )
+            const locals = await firstPayloadReceived
+            assert.strictEqual(
+              containsTimeBudget(locals),
+              true,
+              'expected at least one field/element to be marked with notCapturedReason: "timeout"'
+            )
+            await errorDiagnosticsReceived
+
+            // Prepare to assert that no snapshot is produced on a subsequent trigger
+            const noSnapshotAfterSecondTrigger = new Promise((/** @type {(value?: unknown) => void} */ resolve) => {
+              t.agent.once('debugger-input', ({ payload: [{ debugger: { snapshot: { captures } } }] }) => {
+                assert.strictEqual(captures, undefined)
+                resolve()
+              })
+            })
+
+            // Trigger the same breakpoint again directly
+            const result2 = await t.axios.get(breakpoint.url)
+            assert.ok(
+              result2.data.paused <= 5,
+              `expected thread to be paused <=5ms, but was paused for ~${result2.data.paused}ms`
+            )
+
+            await noSnapshotAfterSecondTrigger
+          }
         )

-        // TODO: Make this pass
-        // eslint-disable-next-line mocha/no-pending-tests
-        it.skip(
-          'should keep budget when state includes an array of 1 million primitives',
-          test({ t, maxPausedTime: target + tolerance, breakpointIndex: 2, maxReferenceDepth: 1 })
+        it(
+          'should keep budget when state includes collections with 1 million elements',
+          // A tolerance of 5ms is used to avoid flakiness
+          test({ t, maxPausedTime: target + 5, breakpointIndex: 2, maxReferenceDepth: 1 }, (locals) => {
+            assert.strictEqual(locals.arrOfPrimitives.notCapturedReason, 'collectionSize')
+            assert.strictEqual(locals.arrOfPrimitives.size, 1_000_000)
+            assert.strictEqual(locals.arrOfPrimitives.elements.length, 0)
+            assert.strictEqual(locals.arrOfObjects.notCapturedReason, 'collectionSize')
+            assert.strictEqual(locals.arrOfObjects.size, 1_000_000)
+            assert.strictEqual(locals.arrOfObjects.elements.length, 0)
+            assert.strictEqual(locals.map.notCapturedReason, 'collectionSize')
+            assert.strictEqual(locals.map.size, 1_000_000)
+            assert.strictEqual(locals.map.entries.length, 0)
+            assert.strictEqual(locals.set.notCapturedReason, 'collectionSize')
+            assert.strictEqual(locals.set.size, 1_000_000)
+            assert.strictEqual(locals.set.elements.length, 0)
+          })
         )

-        // TODO: Make this pass
-        // eslint-disable-next-line mocha/no-pending-tests
-        it.skip(
-          'should keep budget when state includes an array of 1 million objects',
-          test({ t, maxPausedTime: target + tolerance, breakpointIndex: 3, maxReferenceDepth: 1 })
+        it(
+          'should keep budget when state includes collections with less than the size threshold',
+          // A tolerance of 30ms is used to avoid flakiness
+          test({ t, maxPausedTime: target + 30, breakpointIndex: 3, maxReferenceDepth: 1 }, (locals) => {
+            assert.strictEqual(locals.arrOfPrimitives.notCapturedReason, 'collectionSize')
+            assert.strictEqual(locals.arrOfPrimitives.size, COLLECTION_SIZE_THRESHOLD - 1)
+            assert.strictEqual(locals.arrOfPrimitives.elements.length, DEFAULT_MAX_COLLECTION_SIZE)
+            assert.strictEqual(locals.arrOfObjects.notCapturedReason, 'collectionSize')
+            assert.strictEqual(locals.arrOfObjects.size, COLLECTION_SIZE_THRESHOLD - 1)
+            assert.strictEqual(locals.arrOfObjects.elements.length, DEFAULT_MAX_COLLECTION_SIZE)
+            assert.strictEqual(locals.map.notCapturedReason, 'collectionSize')
+            assert.strictEqual(locals.map.size, COLLECTION_SIZE_THRESHOLD - 1)
+            assert.strictEqual(locals.map.entries.length, DEFAULT_MAX_COLLECTION_SIZE)
+            assert.strictEqual(locals.set.notCapturedReason, 'collectionSize')
+            assert.strictEqual(locals.set.size, COLLECTION_SIZE_THRESHOLD - 1)
+            assert.strictEqual(locals.set.elements.length, DEFAULT_MAX_COLLECTION_SIZE)
+          })
         )
       })
     })
   })
 })

-function test ({ t, maxPausedTime, breakpointIndex, maxReferenceDepth }) {
+function test ({ t, maxPausedTime = 0, breakpointIndex, maxReferenceDepth }, assertFn) {
   const breakpoint = t.breakpoints[breakpointIndex]

   return async function () {
     const payloadReceived = new Promise((/** @type {(value?: unknown) => void} */ resolve) => {
       t.agent.on('debugger-input', ({ payload: [{ debugger: { snapshot: { captures } } }] }) => {
         const { locals } = captures.lines[breakpoint.line]
-        assert.strictEqual(
-          containsTimeBudget(locals),
-          true,
-          'expected at least one field/element to be marked with notCapturedReason: "timeout"'
-        )
+        assertFn?.(locals)
         resolve()
       })
     })

integration-tests/debugger/target-app/snapshot-time-budget.js

Lines changed: 17 additions & 8 deletions
@@ -6,6 +6,8 @@ const Fastify = require('fastify')

 const fastify = Fastify({ logger: { level: 'error' } })

+const COLLECTION_SIZE_THRESHOLD = 500
+
 fastify.get('/deeply-nested-large-object', function handler () {
   // The size of `obj` generated is carefully tuned to never result in a snapshot larger than the 1MB size limit, while
   // still being large enough to trigger the time budget limit. However, the size of `fastify` and `request` is not
@@ -16,24 +18,31 @@ fastify.get('/deeply-nested-large-object', function handler () {
   return { paused: Number(diff) / 1_000_000 }
 })

-fastify.get('/large-object-of-primitives', function handler () {
+fastify.get('/object-with-many-properties', function handler () {
   const obj = generateObject(0, 1_000_000) // eslint-disable-line no-unused-vars
   const start = process.hrtime.bigint()
-  const diff = process.hrtime.bigint() - start // BREAKPOINT: /large-object-of-primitives
+  const diff = process.hrtime.bigint() - start // BREAKPOINT: /object-with-many-properties
   return { paused: Number(diff) / 1_000_000 }
 })

-fastify.get('/large-array-of-primitives', function handler () {
-  const arr = Array.from({ length: 1_000_000 }, (_, i) => i) // eslint-disable-line no-unused-vars
+fastify.get('/large-collections', function handler () {
+  const arrOfPrimitives = Array.from({ length: 1_000_000 }, (_, i) => i) // eslint-disable-line no-unused-vars
+  const arrOfObjects = Array.from({ length: 1_000_000 }, (_, i) => ({ i }))
+  const map = new Map(arrOfObjects.map((obj, i) => [i, obj])) // eslint-disable-line no-unused-vars
+  const set = new Set(arrOfObjects) // eslint-disable-line no-unused-vars
   const start = process.hrtime.bigint()
-  const diff = process.hrtime.bigint() - start // BREAKPOINT: /large-array-of-primitives
+  const diff = process.hrtime.bigint() - start // BREAKPOINT: /large-collections
   return { paused: Number(diff) / 1_000_000 }
 })

-fastify.get('/large-array-of-objects', function handler () {
-  const arr = Array.from({ length: 1_000_000 }, (_, i) => ({ i })) // eslint-disable-line no-unused-vars
+fastify.get('/large-collections-below-size-threshold', function handler () {
+  // eslint-disable-next-line no-unused-vars
+  const arrOfPrimitives = Array.from({ length: COLLECTION_SIZE_THRESHOLD - 1 }, (_, i) => i)
+  const arrOfObjects = Array.from({ length: COLLECTION_SIZE_THRESHOLD - 1 }, (_, i) => ({ i }))
+  const map = new Map(arrOfObjects.map((obj, i) => [i, obj])) // eslint-disable-line no-unused-vars
+  const set = new Set(arrOfObjects) // eslint-disable-line no-unused-vars
   const start = process.hrtime.bigint()
-  const diff = process.hrtime.bigint() - start // BREAKPOINT: /large-array-of-objects
+  const diff = process.hrtime.bigint() - start // BREAKPOINT: /large-collections-below-size-threshold
   return { paused: Number(diff) / 1_000_000 }
 })

packages/dd-trace/src/debugger/devtools_client/index.js

Lines changed: 11 additions & 5 deletions
@@ -6,7 +6,7 @@ const session = require('./session')
 const { getLocalStateForCallFrame } = require('./snapshot')
 const send = require('./send')
 const { getStackFromCallFrames } = require('./state')
-const { ackEmitting } = require('./status')
+const { ackEmitting, ackError } = require('./status')
 const config = require('./config')
 const { MAX_SNAPSHOTS_PER_SECOND_GLOBALLY } = require('./defaults')
 const log = require('./log')
@@ -165,16 +165,17 @@ session.on('Debugger.paused', async ({ params }) => {
   }

   // TODO: Create unique states for each affected probe based on that probes unique `capture` settings (DEBUG-2863)
-  const processLocalState = numberOfProbesWithSnapshots !== 0 && await getLocalStateForCallFrame(
-    params.callFrames[0],
-    {
+  let processLocalState, fatalSnapshotError
+  if (numberOfProbesWithSnapshots !== 0) {
+    const opts = {
       maxReferenceDepth,
       maxCollectionSize,
       maxFieldCount,
       maxLength,
       deadlineNs: start + BigInt(config.dynamicInstrumentation.captureTimeoutMs ?? 10) * 1_000_000n
     }
-  )
+    ;({ processLocalState, fatalSnapshotError } = await getLocalStateForCallFrame(params.callFrames[0], opts))
+  }

   await session.post('Debugger.resume')
   const diff = process.hrtime.bigint() - start // TODO: Recored as telemetry (DEBUG-2858)
@@ -215,6 +216,11 @@ session.on('Debugger.paused', async ({ params }) => {

     if (probe.captureSnapshot) {
       const state = processLocalState()
+      if (fatalSnapshotError) {
+        // There was an error collecting the snapshot for this probe, let's not try again
+        probe.captureSnapshot = false
+        ackError(fatalSnapshotError, probe)
+      }
       if (state instanceof Error) {
         snapshot.captureError = state.message
       } else if (state) {

packages/dd-trace/src/debugger/devtools_client/snapshot/collector.js

Lines changed: 36 additions & 1 deletion
@@ -5,6 +5,8 @@ const session = require('../session')

 const LEAF_SUBTYPES = new Set(['date', 'regexp'])
 const ITERABLE_SUBTYPES = new Set(['map', 'set', 'weakmap', 'weakset'])
+const SIZE_IN_DESCRIPTION_SUBTYPES = new Set(['array', 'typedarray', 'arraybuffer', 'dataview', 'map', 'set'])
+const COLLECTION_SIZE_THRESHOLD = 500

 module.exports = {
   getRuntimeObject: getObject
@@ -50,6 +52,12 @@ async function getObject (objectId, opts, depth = 0, collection = false) {
   } else if (result.length > opts.maxFieldCount) {
     // Trim the number of properties on the object if there's too many.
     const size = result.length
+    if (size > COLLECTION_SIZE_THRESHOLD) {
+      opts.fatalSnapshotError = new Error(
+        `An object with more than ${COLLECTION_SIZE_THRESHOLD} properties was detected while collecting a snapshot. ` +
+        'Future snapshots for exising probes in this location will be skipped until the Node.js process is restarted'
+      )
+    }
     result.length = opts.maxFieldCount
     result[fieldCountSym] = size
   } else if (privateProperties) {
@@ -69,10 +77,17 @@ async function traverseGetPropertiesResult (props, opts, depth) {

   for (const prop of props) {
     if (prop.value === undefined) continue
-    const { value: { type, objectId, subtype } } = prop
+    const { value: { type, objectId, subtype, description } } = prop
     if (type === 'object') {
       if (objectId === undefined) continue // if `subtype` is "null"
       if (LEAF_SUBTYPES.has(subtype)) continue // don't waste time with these subtypes
+      const size = parseLengthFromDescription(description, subtype)
+      if (size !== null && size >= COLLECTION_SIZE_THRESHOLD) {
+        const empty = []
+        empty[collectionSizeSym] = size
+        prop.value.properties = empty
+        continue
+      }
       work.push([
         prop.value,
         () => getObjectProperties(subtype, objectId, opts, depth).then((properties) => {
@@ -234,6 +249,26 @@ function removeNonEnumerableProperties (props) {
   }
 }

+function parseLengthFromDescription (description, subtype) {
+  if (typeof description !== 'string') return null
+  if (!SIZE_IN_DESCRIPTION_SUBTYPES.has(subtype)) return null
+
+  const open = description.lastIndexOf('(')
+  if (open === -1) return null
+
+  const close = description.indexOf(')', open + 1)
+  if (close === -1) return null
+
+  const s = description.slice(open + 1, close)
+  if (s === '') return null
+
+  const n = Number(s)
+  if (!Number.isSafeInteger(n) || n < 0) return null
+  if (String(n) !== s) return null
+
+  return n
+}
+
 function overBudget (opts) {
   if (opts.deadlineReached) return true
   opts.deadlineReached = process.hrtime.bigint() >= opts.deadlineNs
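
For reference, the new parseLengthFromDescription helper keys off the element count that V8 reports in CDP RemoteObject descriptions. The following input/output pairs are illustrative only (the helper is module-private, and the exact description strings depend on the V8/Node.js version):

parseLengthFromDescription('Array(1000000)', 'array')        // => 1000000, enumeration is skipped and the size recorded
parseLengthFromDescription('Uint8Array(1024)', 'typedarray') // => 1024
parseLengthFromDescription('Map(499)', 'map')                // => 499, below the threshold, so elements are captured normally
parseLengthFromDescription('Object', undefined)              // => null, plain objects go through the maxFieldCount path instead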

packages/dd-trace/src/debugger/devtools_client/snapshot/index.js

Lines changed: 6 additions & 3 deletions
@@ -65,8 +65,11 @@ async function getLocalStateForCallFrame (callFrame, opts = {}) {
   }

   // Delay calling `processRawState` so the caller gets a chance to resume the main thread before processing `rawState`
-  return () => {
-    processedState = processedState ?? processRawState(rawState, opts.maxLength)
-    return processedState
+  return {
+    processLocalState () {
+      processedState = processedState ?? processRawState(rawState, opts.maxLength)
+      return processedState
+    },
+    fatalSnapshotError: opts.fatalSnapshotError
   }
 }
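
Combined with the change to devtools_client/index.js above, the call-site contract for the new return shape looks roughly like the following condensed sketch (collectSnapshotState is a made-up wrapper; callFrame, opts, probe and ackError stand in for the real values in the caller):

const { getLocalStateForCallFrame } = require('./snapshot')

async function collectSnapshotState (callFrame, opts, probe, ackError) {
  // State is fetched up front, but processing is deferred so the caller can
  // resume the main thread before serializing the snapshot.
  const { processLocalState, fatalSnapshotError } = await getLocalStateForCallFrame(callFrame, opts)

  if (fatalSnapshotError) {
    // The object was too large: stop taking snapshots for this probe and
    // surface the error via the diagnostics endpoint.
    probe.captureSnapshot = false
    ackError(fatalSnapshotError, probe)
  }

  return processLocalState() // lazily converts the raw CDP state into the snapshot format
}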

packages/dd-trace/test/debugger/devtools_client/snapshot/complex-types.spec.js

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ describe('debugger -> devtools client -> snapshot.getLocalStateForCallFrame', fu
       resolve((await getLocalStateForCallFrame(
         params.callFrames[0],
         { maxFieldCount: Number.MAX_SAFE_INTEGER })
-      )())
+      ).processLocalState())
     })

     await setAndTriggerBreakpoint(target, 10)
