Skip to content

Commit d9d130b

Browse files
authored
Merge pull request #17 from ktuvw/latency_print
Added prints to show additional token latency information
2 parents b3fff05 + 6da3fd6 commit d9d130b

File tree

1 file changed

+7
-0
lines changed

1 file changed

+7
-0
lines changed

scripts/inference.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -597,6 +597,13 @@ def infer(use_cache, do_sample, warmup):
597597
if args.timing == "e2e":
598598
dprint(f"E2E timing information: {timings[0]:.3f}s")
599599
elif args.timing == "per-token":
600+
if not warmup:
601+
dprint(f"First-token latency: {timings[0]*1000:.3f} ms")
602+
dprint(f"Average next-token latency: {np.mean(timings[1:])*1000:.3f} ms")
603+
dprint(f"Average next-token latency (including first token): {np.mean(timings)*1000:.3f} ms")
604+
dprint(f"Max next-token latency: {np.max(timings[1:])*1000:.3f} ms (token #{np.argmax(timings[1:]) + 2})")
605+
dprint(f"Min next-token latency: {np.min(timings[1:])*1000:.3f} ms (token #{np.argmin(timings[1:]) + 2})")
606+
dprint(f"Std deviation of next-token latencies: {np.std(timings[1:])*1000:.3f} ms")
600607
timings = [f"{t*1000:.3f}" for t in timings]
601608
dprint(f"Per-token timing information: {', '.join(timings)} ms")
602609
if len(result.shape) == 1:

0 commit comments

Comments
 (0)