From d52887038dcc3d23ffb2c0894c5679696e9bcd2a Mon Sep 17 00:00:00 2001
From: ntjohnson1 <24689722+ntjohnson1@users.noreply.github.com>
Date: Wed, 29 Apr 2026 16:58:14 -0400
Subject: [PATCH] Add details on caching to skill

---
 skills/datafusion_python/SKILL.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/skills/datafusion_python/SKILL.md b/skills/datafusion_python/SKILL.md
index 7b07b430f..98fa2c7aa 100644
--- a/skills/datafusion_python/SKILL.md
+++ b/skills/datafusion_python/SKILL.md
@@ -253,6 +253,7 @@ polars_df = df.to_polars() # pl.DataFrame
 py_dict = df.to_pydict() # dict[str, list]
 py_list = df.to_pylist() # list[dict]
 count = df.count() # int
+df = df.cache() # materialize in memory, return DataFrame
 ```
 
 ### Date and Timestamp Type Conversion
@@ -309,6 +310,29 @@ Async iteration is also supported via `async for batch in df: ...` (or
 `df.execute_stream()`), which is useful when batches are interleaved with
 other I/O.
 
+### Caching Intermediate Results
+
+`df.cache()` materializes a DataFrame as an in-memory table and returns a new
+DataFrame backed by it. Reach for it when the same intermediate result feeds
+multiple downstream queries — without `cache()`, each branch re-executes the
+full upstream plan (re-reading files, recomputing filters/aggregates).
+
+```python
+base = (
+    ctx.read_parquet("orders.parquet")
+    .filter(col("status") == "shipped")
+    .cache()  # materialize once, reuse below
+)
+by_region = base.aggregate(["region"], [F.sum(col("amount")).alias("total")])
+by_customer = base.aggregate(["customer"], [F.sum(col("amount")).alias("total")])
+```
+
+Skip `cache()` for single-use DataFrames — the lazy plan is already optimal.
+
+The cached table is owned by the DataFrame returned from `cache()` (and any
+DataFrames chained from it). To free the memory, drop every reference — let
+them go out of scope, or `del base; del by_region; del by_customer`.
+
 ### Writing Results
 
 ```python