diff --git a/pipeline/_internal.py b/pipeline/_internal.py index f643312..58c3c17 100644 --- a/pipeline/_internal.py +++ b/pipeline/_internal.py @@ -1,25 +1,29 @@ from dataclasses import dataclass import polars as pl +import dataframely as dy +from .schema.report import ( + AverageCarVolumeSchema, + PopularModelsSchema, + SafestModelsSchema, +) @dataclass class Report: - popularity: pl.DataFrame | pl.LazyFrame - safety: pl.DataFrame | pl.LazyFrame - volume: pl.DataFrame | pl.LazyFrame + popularity: dy.LazyFrame[PopularModelsSchema] + safety: dy.LazyFrame[SafestModelsSchema] + volume: dy.LazyFrame[AverageCarVolumeSchema] def to_string(self) -> str: """ Create a pretty-printable representation of this report. """ - # Enforce laziness and collection here to ensure we are - # df_popularity, df_volume, df_safety = pl.collect_all( [ - self.popularity.lazy(), - self.volume.lazy().sort("age_of_car"), - self.safety.lazy(), + self.popularity, + self.volume, + self.safety, ] ) header = [ diff --git a/pipeline/data.py b/pipeline/data.py index 9114d6c..34aa36b 100644 --- a/pipeline/data.py +++ b/pipeline/data.py @@ -1,15 +1,17 @@ from dataclasses import dataclass +import dataframely as dy -import polars as pl +from .schema.preprocessed import PrepPoliciesSchema, PrepModelsSchema +from .schema.raw import RawModelsSchema, RawPoliciesSchema @dataclass -class RawData[T: (pl.DataFrame | pl.LazyFrame)]: - models: T - policies: T +class RawData: + models: dy.LazyFrame[RawModelsSchema] + policies: dy.LazyFrame[RawPoliciesSchema] @dataclass -class PreprocessedData[T: (pl.DataFrame | pl.LazyFrame)]: - models: T - policies: T +class PreprocessedData: + models: dy.LazyFrame[PrepModelsSchema] + policies: dy.LazyFrame[PrepPoliciesSchema] diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py index 042a15d..b3814ab 100644 --- a/pipeline/preprocess.py +++ b/pipeline/preprocess.py @@ -1,6 +1,8 @@ import polars as pl - +import dataframely as dy from .data import PreprocessedData, RawData +from .schema.preprocessed import PrepModelsSchema, PrepPoliciesSchema +from .schema.raw import RawModelsSchema, RawPoliciesSchema def preprocess(raw: RawData) -> PreprocessedData: @@ -10,62 +12,48 @@ def preprocess(raw: RawData) -> PreprocessedData: ) -def preprocess_policies[T: (pl.DataFrame, pl.LazyFrame)](policies: T) -> T: +def preprocess_policies( + policies: dy.LazyFrame[RawPoliciesSchema], +) -> dy.LazyFrame[PrepPoliciesSchema]: """Transform the raw policies for optimal representation.""" return policies.with_columns( - # Categorical columns - pl.col("model").cast(pl.Categorical), - pl.col("area_cluster").cast(pl.Categorical), - # Float columns often do not need full 64-bit precision - # This depends on the domain we are working on - pl.col("policy_tenure").cast(pl.Float32), - pl.col("age_of_car").cast(pl.Float32), - pl.col("age_of_policyholder").cast(pl.Float32), - pl.col("population_density").cast(pl.Float32), # Normalize ID - pl.col("policy_id").str.strip_prefix("policy").cast(pl.UInt64), - ) + pl.col("policy_id").str.strip_prefix("policy"), + ).pipe(PrepPoliciesSchema.validate, cast=True, eager=False) -def preprocess_models[T: (pl.DataFrame, pl.LazyFrame)](models: T) -> T: +def preprocess_models( + models: dy.LazyFrame[RawModelsSchema], +) -> dy.LazyFrame[PrepModelsSchema]: """Transform the raw models for optimal representation.""" + # Unique to drop duplicate rows we found while investigating primary key failures + df = models.unique() + # 1. Convert semantically boolean columns from pl.String to pl.Boolean - df = models.with_columns(pl.col("^is_.*$") == "Yes") + df = df.with_columns(pl.col("^is_.*$") == "Yes") # 2. Split max torque and power into components torque_parts = pl.col("max_torque").str.split("@") df = df.with_columns( - max_torque_nm=torque_parts.list[0].str.strip_suffix("Nm").cast(pl.Float32), - max_torque_rpm=torque_parts.list[1].str.strip_suffix("rpm").cast(pl.UInt16), + max_torque_nm=torque_parts.list[0].str.strip_suffix("Nm"), + max_torque_rpm=torque_parts.list[1].str.strip_suffix("rpm"), ) power_parts = pl.col("max_power").str.split("@") df = df.with_columns( - max_power_bhp=power_parts.list[0].str.strip_suffix("bhp").cast(pl.Float16), - max_power_rpm=power_parts.list[1].str.strip_suffix("rpm").cast(pl.UInt16), + max_power_bhp=power_parts.list[0].str.strip_suffix("bhp"), + max_power_rpm=power_parts.list[1].str.strip_suffix("rpm"), ) - # Step 3: Use efficient data types + # Step 4: Ensure that length / width / height are in millimeters, not centimeters + def _ensure_mm(col: pl.Expr): + return pl.when(col < 1_000).then(col * 10).otherwise(col) + df = df.with_columns( - # Some of the categorical columns are easily enumerated - pl.col("steering_type").cast(pl.Enum(["Electric", "Manual", "Power"])), - pl.col("fuel_type").cast(pl.Enum(["CNG", "Diesel", "Petrol"])), - pl.col("rear_brakes_type").cast(pl.Enum(["Drum", "Disc"])), - # For other categoricals, we may not be sure yet that we have seen all values - # so we do not want to commit to an Enum, yet - pl.col("engine_type").cast(pl.Categorical), - pl.col("model").cast(pl.Categorical), - pl.col("segment").cast(pl.Categorical), - # Value-based dtypes - pl.col("width").cast(pl.UInt16), - pl.col("height").cast(pl.UInt16), - pl.col("length").cast(pl.UInt16), - pl.col("displacement").cast(pl.UInt16), - pl.col("cylinder").cast(pl.UInt8), - pl.col("gross_weight").cast(pl.UInt16), - pl.col("gear_box").cast(pl.UInt8), - pl.col("airbags").cast(pl.UInt8), + _ensure_mm(pl.col("length")), + _ensure_mm(pl.col("width")), + _ensure_mm(pl.col("height")), ) - return df + return df.pipe(PrepModelsSchema.validate, cast=True, eager=False) diff --git a/pipeline/report.py b/pipeline/report.py index 1161270..d36173b 100644 --- a/pipeline/report.py +++ b/pipeline/report.py @@ -1,7 +1,14 @@ import polars as pl +import dataframely as dy from ._internal import Report from .data import PreprocessedData +from .schema.preprocessed import PrepModelsSchema, PrepPoliciesSchema +from .schema.report import ( + AverageCarVolumeSchema, + PopularModelsSchema, + SafestModelsSchema, +) def build_report(prep: PreprocessedData) -> Report: @@ -12,9 +19,9 @@ def build_report(prep: PreprocessedData) -> Report: ) -def find_three_most_popular_make_and_models[T: (pl.DataFrame, pl.LazyFrame)]( - models: T, policies: T -) -> T: +def find_three_most_popular_make_and_models( + models: dy.LazyFrame[PrepModelsSchema], policies: dy.LazyFrame[PrepPoliciesSchema] +) -> dy.LazyFrame[PopularModelsSchema]: """Among all policies, compute the three make/model combinations that appears most often. Returns: @@ -26,10 +33,13 @@ def find_three_most_popular_make_and_models[T: (pl.DataFrame, pl.LazyFrame)]( .agg(count=pl.len()) .sort("count", descending=True) .head(3) + .pipe(PopularModelsSchema.validate, cast=True, eager=False) ) -def find_safest_models[T: (pl.DataFrame, pl.LazyFrame)](models: T) -> T: +def find_safest_models( + models: dy.LazyFrame[PrepModelsSchema], +) -> dy.LazyFrame[SafestModelsSchema]: """Among all models, find the safest ones as measured by the number of safety features. Returns: @@ -41,12 +51,13 @@ def find_safest_models[T: (pl.DataFrame, pl.LazyFrame)](models: T) -> T: ) .sort("safety_score", descending=True) .head(5) + .pipe(SafestModelsSchema.validate, cast=True, eager=False) ) -def find_average_car_volume_by_age[T: (pl.DataFrame, pl.LazyFrame)]( - models: T, policies: T -) -> T: +def find_average_car_volume_by_age( + models: dy.LazyFrame[PrepModelsSchema], policies: dy.LazyFrame[PrepPoliciesSchema] +) -> dy.LazyFrame[AverageCarVolumeSchema]: """Among all policies, find the mean physical car volume in 10-year blocks of car age. This method should compute the volume of a car if interpreted as cuboid (i.e. box-shaped). @@ -68,4 +79,6 @@ def find_average_car_volume_by_age[T: (pl.DataFrame, pl.LazyFrame)]( - 1 ) ) + .sort("age_of_car") + .pipe(AverageCarVolumeSchema.validate, cast=True, eager=False) ) diff --git a/pipeline/schema/preprocessed.py b/pipeline/schema/preprocessed.py index e3a5058..9685b6e 100644 --- a/pipeline/schema/preprocessed.py +++ b/pipeline/schema/preprocessed.py @@ -1,4 +1,5 @@ import dataframely as dy +import polars as pl class PrepPoliciesSchema(dy.Schema): @@ -51,3 +52,12 @@ class PrepModelsSchema(dy.Schema): max_torque_rpm = dy.UInt16() max_power_bhp = dy.Float32() max_power_rpm = dy.UInt16() + + @dy.rule() + def volume_is_realistic(cls) -> pl.Expr: + """Only allow reasonably sized cars""" + volume = cls.length.col.cast(pl.UInt64) * cls.width.col * cls.height.col + + # Lengths are in millimeters and 1e9 mm^3 is 1 cubic meter + cubic_meter = 1e9 + return volume.is_between(1 * cubic_meter, 20 * cubic_meter) diff --git a/pipeline/schema/raw.py b/pipeline/schema/raw.py index 56a3d9c..a83d089 100644 --- a/pipeline/schema/raw.py +++ b/pipeline/schema/raw.py @@ -16,7 +16,7 @@ class RawPoliciesSchema(dy.Schema): class RawModelsSchema(dy.Schema): """Schema for the raw models table as provided by our data source""" - model = dy.String(primary_key=True) + model = dy.String() segment = dy.String() fuel_type = dy.String() airbags = dy.Int64() diff --git a/pipeline/schema/report.py b/pipeline/schema/report.py index 5adec38..8f54cfc 100644 --- a/pipeline/schema/report.py +++ b/pipeline/schema/report.py @@ -2,15 +2,18 @@ class PopularModelsSchema(dy.Schema): - # TODO: Fill out - ... + make = dy.String() + model = dy.String(primary_key=True) + count = dy.UInt32() class SafestModelsSchema(dy.Schema): - # TODO: Fill out - ... + model = dy.Categorical(primary_key=True) + segment = dy.Categorical() + safety_score = dy.UInt16() class AverageCarVolumeSchema(dy.Schema): - # TODO: Fill out - ... + age_of_car = dy.String(primary_key=True) + volume = dy.Float32() + change = dy.Float32(nullable=True) diff --git a/tests/test_report.py b/tests/test_report.py index d8b7623..87645ee 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -13,18 +13,22 @@ def test_find_average_car_volume_by_age(): {"model": "M2", "height": 2_000, "width": 2_000, "length": 2_000}, ] ) - # TODO: Use `.sample` to create a policies dataframe with two policies: - # One with model "M1" and car age 4.5, - # One with model "M2" and car age 14.5 - policies = PrepPoliciesSchema.sample(...) + policies = PrepPoliciesSchema.sample( + overrides=[ + {"model": "M1", "age_of_car": 4.5}, + {"model": "M2", "age_of_car": 14.5}, + ] + ) volume_m1 = 1e-9 * 1_500 * 2_000 * 2_500 volume_m2 = 1e-9 * 2_000 * 2_000 * 2_000 change = 100 * (volume_m2 / volume_m1 - 1) expected = AverageCarVolumeSchema.validate( - # TODO: Add the second, missing row for the expected dataframe pl.DataFrame( - [{"age_of_car": "(-inf, 10]", "volume": volume_m1, "change": None}, ...] + [ + {"age_of_car": "(-inf, 10]", "volume": volume_m1, "change": None}, + {"age_of_car": "(10, 20]", "volume": volume_m2, "change": change}, + ] ), cast=True, ).lazy() diff --git a/tutorial.ipynb b/tutorial.ipynb index 5c3fe93..4bf2dc0 100644 --- a/tutorial.ipynb +++ b/tutorial.ipynb @@ -421,23 +421,35 @@ "outputs": [], "source": [ "from pipeline.data import RawData\n", - "from pipeline.preprocess import preprocess" + "from pipeline.preprocess import preprocess\n", + "from pipeline.schema.raw import RawModelsSchema, RawPoliciesSchema" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 27, "id": "d29cb3ab", "metadata": {}, "outputs": [], "source": [ - "raw = RawData(models, policies)\n", + "raw = RawData(\n", + " models=RawModelsSchema.validate(models, cast=True).lazy(),\n", + " policies=RawPoliciesSchema.validate(policies, cast=True).lazy()\n", + ")\n", "preprocessed = preprocess(raw)" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, + "id": "05b24a9d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 28, "id": "6f3f01b1", "metadata": {}, "outputs": [], @@ -451,7 +463,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 29, "id": "bf070282", "metadata": {}, "outputs": [ @@ -466,15 +478,15 @@ "source": [ "print(\n", " \"Policies: \"\n", - " f\"{raw.policies.estimated_size('mb'):.2f} MB\",\n", + " f\"{raw.policies.collect().estimated_size('mb'):.2f} MB\",\n", " \"->\",\n", - " f\"{preprocessed.policies.estimated_size('mb'):.2f} MB\",\n", + " f\"{preprocessed.policies.collect().estimated_size('mb'):.2f} MB\",\n", ")" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 30, "id": "59e7a957", "metadata": {}, "outputs": [ @@ -482,22 +494,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Models: 0.26 MB -> 0.14 MB\n" + "Models: 0.25 MB -> 0.08 MB\n" ] } ], "source": [ "print(\n", " \"Models: \"\n", - " f\"{raw.models.estimated_size('mb'):.2f} MB\",\n", + " f\"{raw.models.collect().estimated_size('mb'):.2f} MB\",\n", " \"->\",\n", - " f\"{preprocessed.models.estimated_size('mb'):.2f} MB\",\n", + " f\"{preprocessed.models.collect().estimated_size('mb'):.2f} MB\",\n", ")" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 47, "id": "f642d158", "metadata": {}, "outputs": [ @@ -526,7 +538,7 @@ "└─────────┴──────────┘" ] }, - "execution_count": 21, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -536,12 +548,12 @@ "raw.models.select(\n", " logical = pl.col(\"fuel_type\"),\n", " physical = pl.col(\"fuel_type\").to_physical()\n", - ").head(3)" + ").head(3).collect()" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 49, "id": "cc991ed6", "metadata": {}, "outputs": [ @@ -555,10 +567,10 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 2)
logicalphysical
enumu8
"Diesel"1
"CNG"0
"Petrol"2
" + "shape: (10, 2)
logicalphysical
enumu8
"Diesel"1
"CNG"0
"Diesel"1
"Diesel"1
"Petrol"2
"Petrol"2
"Diesel"1
"CNG"0
"Diesel"1
"CNG"0
" ], "text/plain": [ - "shape: (3, 2)\n", + "shape: (10, 2)\n", "┌─────────┬──────────┐\n", "│ logical ┆ physical │\n", "│ --- ┆ --- │\n", @@ -566,11 +578,18 @@ "╞═════════╪══════════╡\n", "│ Diesel ┆ 1 │\n", "│ CNG ┆ 0 │\n", + "│ Diesel ┆ 1 │\n", + "│ Diesel ┆ 1 │\n", + "│ Petrol ┆ 2 │\n", "│ Petrol ┆ 2 │\n", + "│ Diesel ┆ 1 │\n", + "│ CNG ┆ 0 │\n", + "│ Diesel ┆ 1 │\n", + "│ CNG ┆ 0 │\n", "└─────────┴──────────┘" ] }, - "execution_count": 22, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -582,7 +601,7 @@ "preprocessed.models.select(\n", " logical = pl.col(\"fuel_type\"),\n", " physical = pl.col(\"fuel_type\").to_physical()\n", - ").head(3)" + ").head(10).collect()" ] }, { @@ -595,7 +614,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 33, "id": "db661295", "metadata": {}, "outputs": [], @@ -605,7 +624,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 34, "id": "e4f6939d", "metadata": {}, "outputs": [], @@ -619,7 +638,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 35, "id": "15183652", "metadata": {}, "outputs": [ @@ -638,7 +657,7 @@ "┌──────┬───────┬────────┐\n", "│ make ┆ model ┆ count │\n", "│ --- ┆ --- ┆ --- │\n", - "│ u64 ┆ cat ┆ u32 │\n", + "│ str ┆ str ┆ u32 │\n", "╞══════╪═══════╪════════╡\n", "│ 10 ┆ M868 ┆ 263656 │\n", "│ 6 ┆ M652 ┆ 131735 │\n", @@ -650,13 +669,13 @@ "┌───────┬─────────┬──────────────┐\n", "│ model ┆ segment ┆ safety_score │\n", "│ --- ┆ --- ┆ --- │\n", - "│ cat ┆ cat ┆ u32 │\n", + "│ str ┆ cat ┆ u16 │\n", "╞═══════╪═════════╪══════════════╡\n", - "│ M282 ┆ A ┆ 15 │\n", "│ M626 ┆ C1 ┆ 15 │\n", + "│ M282 ┆ A ┆ 15 │\n", + "│ M301 ┆ Utility ┆ 15 │\n", "│ M919 ┆ A ┆ 15 │\n", "│ M998 ┆ Utility ┆ 15 │\n", - "│ M301 ┆ Utility ┆ 15 │\n", "└───────┴─────────┴──────────────┘\n", "\n", "\n", @@ -666,13 +685,13 @@ "┌────────────┬──────────┬────────────┐\n", "│ age_of_car ┆ volume ┆ change │\n", "│ --- ┆ --- ┆ --- │\n", - "│ cat ┆ f64 ┆ f64 │\n", + "│ str ┆ f32 ┆ f32 │\n", "╞════════════╪══════════╪════════════╡\n", - "│ (-inf, 10] ┆ 8.854143 ┆ null │\n", - "│ (10, 20] ┆ 1.411613 ┆ -84.057029 │\n", - "│ (20, 30] ┆ 0.007873 ┆ -99.442247 │\n", - "│ (30, 40] ┆ 0.00739 ┆ -6.137952 │\n", - "│ (40, 50] ┆ 0.007501 ┆ 1.503317 │\n", + "│ (-inf, 10] ┆ 9.980824 ┆ null │\n", + "│ (10, 20] ┆ 8.299253 ┆ -16.848015 │\n", + "│ (20, 30] ┆ 7.874096 ┆ -5.122838 │\n", + "│ (30, 40] ┆ 7.391098 ┆ -6.134008 │\n", + "│ (40, 50] ┆ 7.501146 ┆ 1.488916 │\n", "└────────────┴──────────┴────────────┘\n", "\n", "============================================================\n" @@ -693,7 +712,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 36, "id": "5be4e3ba", "metadata": {}, "outputs": [], @@ -708,7 +727,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 37, "id": "8f301f7b", "metadata": {}, "outputs": [], @@ -718,7 +737,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 38, "id": "fd2ff44c", "metadata": {}, "outputs": [], @@ -731,7 +750,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 39, "id": "8206dc4b", "metadata": {}, "outputs": [], @@ -742,148 +761,328 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 40, "id": "755fea10", "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ - "\n", - "\n", + "\n", + "\n", "polars_query\n", - "\n", + "\n", "\n", "\n", "p2\n", - "\n", - "SORT BY [col(\"count\")]\n", + "\n", + "FILTER BY col(\"primary_key\")./Users/quantco/repos/tutorial-pycon26-polars-dataframely/.pixi/envs/default/lib/python3.14/site-packages/dataframely/_native.abi3.so:all_rules_required([col(\"make|nullability\"), col(\"model|nullability\"), col(\"count|nullability\")])\n", "\n", "\n", "\n", "p1\n", - "\n", - "SLICE offset: 0; len: 3\n", + "\n", + "π 3/3\n", "\n", "\n", "\n", "p2->p1\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "p3\n", - "\n", - "AGG [len().alias(\"count\")]\n", - "BY\n", - "[col(\"make\"), col(\"model\")]\n", + "\n", + "WITH COLUMNS [col(\"model\").as_struct().is_duplicated().not().fill_null([true]).alias(\"primary_key\"), col(\"make\").is_not_null().fill_null([true]).alias(\"make|nullability\"), col(\"model\").is_not_null().fill_null([true]).alias(\"model|nullability\"), col(\"count\").is_not_null().fill_null([true]).alias(\"count|nullability\")]\n", "\n", "\n", "\n", "p3->p2\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "p4\n", - "\n", - "JOIN INNER\n", - "left: [col(\"model\")];\n", - "right: [col(\"model\")]\n", + "\n", + "π 3/3\n", "\n", "\n", "\n", "p4->p3\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "p5\n", - "\n", - "WITH COLUMNS [col(\"model\").strict_cast(Categorical), col(\"area_cluster\").strict_cast(Categorical), col(\"policy_tenure\").cast(Float32), col(\"age_of_car\").cast(Float32), col(\"age_of_policyholder\").cast(Float32), col(\"population_density\").strict_cast(Float32), col(\"policy_id\").str.strip_prefix([\"policy\"]).strict_cast(UInt64)]\n", + "\n", + "SLICE offset: 0; len: 3\n", "\n", "\n", "\n", "p5->p4\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "p6\n", - "\n", - "TABLE\n", - "π */7\n", + "\n", + "SORT BY [col(\"count\")]\n", "\n", "\n", "\n", "p6->p5\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "p7\n", - "\n", - "WITH COLUMNS [col(\"steering_type\").strict_cast(Enum([...])), col(\"fuel_type\").strict_cast(Enum([...])), col(\"rear_brakes_type\").strict_cast(Enum([...])), col(\"engine_type\").strict_cast(Categorical), col(\"model\").strict_cast(Categorical), col(\"segment\").strict_cast(Categorical), col(\"width\").strict_cast(UInt16), col(\"height\").strict_cast(UInt16), col(\"length\").strict_cast(UInt16), col(\"displacement\").strict_cast(UInt16), col(\"cylinder\").strict_cast(UInt8), col(\"gross_weight\").strict_cast(UInt16), col(\"gear_box\").strict_cast(UInt8), col(\"airbags\").strict_cast(UInt8)]\n", + "\n", + "AGG [len().alias(\"count\")]\n", + "BY\n", + "[col(\"make\"), col(\"model\")]\n", "\n", - "\n", + "\n", "\n", - "p7->p4\n", - "\n", - "\n", + "p7->p6\n", + "\n", + "\n", "\n", "\n", "\n", "p8\n", - "\n", - "WITH COLUMNS [col(\"max_power\").str.split([\"@\"]).list.get([dyn int: 0]).str.strip_suffix([\"bhp\"]).strict_cast(Float16).alias(\"max_power_bhp\"), col(\"max_power\").str.split([\"@\"]).list.get([dyn int: 1]).str.strip_suffix([\"rpm\"]).strict_cast(UInt16).alias(\"max_power_rpm\")]\n", + "\n", + "JOIN INNER\n", + "left: [col(\"model\")];\n", + "right: [col(\"model\")]\n", "\n", "\n", "\n", "p8->p7\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "p9\n", - "\n", - "WITH COLUMNS [col(\"max_torque\").str.split([\"@\"]).list.get([dyn int: 0]).str.strip_suffix([\"Nm\"]).strict_cast(Float32).alias(\"max_torque_nm\"), col(\"max_torque\").str.split([\"@\"]).list.get([dyn int: 1]).str.strip_suffix([\"rpm\"]).strict_cast(UInt16).alias(\"max_torque_rpm\")]\n", + "\n", + "π 7/7\n", "\n", "\n", "\n", "p9->p8\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "p10\n", - "\n", - "WITH COLUMNS [[(col(\"is_esc\")) == (\"Yes\")], [(col(\"is_adjustable_steering\")) == (\"Yes\")], [(col(\"is_tpms\")) == (\"Yes\")], [(col(\"is_parking_sensors\")) == (\"Yes\")], [(col(\"is_parking_camera\")) == (\"Yes\")], [(col(\"is_front_fog_lights\")) == (\"Yes\")], [(col(\"is_rear_window_wiper\")) == (\"Yes\")], [(col(\"is_rear_window_washer\")) == (\"Yes\")], [(col(\"is_rear_window_defogger\")) == (\"Yes\")], [(col(\"is_brake_assist\")) == (\"Yes\")], [(col(\"is_power_door_locks\")) == (\"Yes\")], [(col(\"is_central_locking\")) == (\"Yes\")], [(col(\"is_power_steering\")) == (\"Yes\")], [(col(\"is_driver_seat_height_adjustable\")) == (\"Yes\")], [(col(\"is_day_night_rear_view_mirror\")) == (\"Yes\")], [(col(\"is_ecw\")) == (\"Yes\")], [(col(\"is_speed_alert\")) == (\"Yes\")]]\n", + "\n", + "FILTER BY col(\"primary_key\")./Users/quantco/repos/tutorial-pycon26-polars-dataframely/.pixi/envs/default/lib/python3.14/site-packages/dataframely/_native.abi3.so:all_rules_required([col(\"policy_id|nullability\"), col(\"policy_tenure|nullability\"), col(\"policy_tenure|inf\"), col(\"policy_tenure|nan\"), col(\"age_of_car|nullability\"), col(\"age_of_car|inf\"), col(\"age_of_car|nan\"), col(\"age_of_policyholder|nullability\"), col(\"age_of_policyholder|inf\"), col(\"age_of_policyholder|nan\"), col(\"area_cluster|nullability\"), col(\"population_density|nullability\"), col(\"population_density|inf\"), col(\"population_density|nan\")])\n", "\n", "\n", "\n", "p10->p9\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "p11\n", - "\n", - "TABLE\n", - "π */39\n", + "\n", + "WITH COLUMNS [col(\"policy_id\").as_struct().is_duplicated().not().fill_null([true]).alias(\"primary_key\"), col(\"policy_id\").is_not_null().fill_null([true]).alias(\"policy_id|nullability\"), col(\"policy_tenure\").is_not_null().fill_null([true]).alias(\"policy_tenure|nullability\"), col(\"policy_tenure\").is_infinite().not().fill_null([true]).alias(\"policy_tenure|inf\"), col(\"policy_tenure\").is_nan().not().fill_null([true]).alias(\"policy_tenure|nan\"), col(\"age_of_car\").is_not_null().fill_null([true]).alias(\"age_of_car|nullability\"), col(\"age_of_car\").is_infinite().not().fill_null([true]).alias(\"age_of_car|inf\"), col(\"age_of_car\").is_nan().not().fill_null([true]).alias(\"age_of_car|nan\"), col(\"age_of_policyholder\").is_not_null().fill_null([true]).alias(\"age_of_policyholder|nullability\"), col(\"age_of_policyholder\").is_infinite().not().fill_null([true]).alias(\"age_of_policyholder|inf\"), col(\"age_of_policyholder\").is_nan().not().fill_null([true]).alias(\"age_of_policyholder|nan\"), col(\"area_cluster\").is_not_null().fill_null([true]).alias(\"area_cluster|nullability\"), col(\"population_density\").is_not_null().fill_null([true]).alias(\"population_density|nullability\"), col(\"population_density\").is_infinite().not().fill_null([true]).alias(\"population_density|inf\"), col(\"population_density\").is_nan().not().fill_null([true]).alias(\"population_density|nan\")]\n", "\n", "\n", "\n", "p11->p10\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p12\n", + "\n", + "π 7/7\n", + "\n", + "\n", + "\n", + "p12->p11\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p13\n", + "\n", + "WITH COLUMNS [col(\"model\").strict_cast(Categorical), col(\"area_cluster\").strict_cast(Categorical), col(\"policy_tenure\").cast(Float32), col(\"age_of_car\").cast(Float32), col(\"age_of_policyholder\").cast(Float32), col(\"population_density\").strict_cast(Float32), col(\"policy_id\").str.strip_prefix([\"policy\"]).strict_cast(UInt64)]\n", + "\n", + "\n", + "\n", + "p13->p12\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p14\n", + "\n", + "TABLE\n", + "π */7\n", + "\n", + "\n", + "\n", + "p14->p13\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p15\n", + "\n", + "π 39/39\n", + "\n", + "\n", + "\n", + "p15->p8\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p16\n", + "\n", + "FILTER BY col(\"primary_key\")./Users/quantco/repos/tutorial-pycon26-polars-dataframely/.pixi/envs/default/lib/python3.14/site-packages/dataframely/_native.abi3.so:all_rules_required([col(\"model|nullability\"), col(\"segment|nullability\"), col(\"fuel_type|nullability\"), col(\"airbags|nullability\"), col(\"is_esc|nullability\"), col(\"is_adjustable_steering|nullability\"), col(\"is_tpms|nullability\"), col(\"is_parking_sensors|nullability\"), col(\"is_parking_camera|nullability\"), col(\"rear_brakes_type|nullability\"), col(\"displacement|nullability\"), col(\"cylinder|nullability\"), col(\"transmission_type|nullability\"), col(\"gear_box|nullability\"), col(\"steering_type|nullability\"), col(\"turning_radius|nullability\"), col(\"turning_radius|inf\"), col(\"turning_radius|nan\"), col(\"length|nullability\"), col(\"length|min\"), col(\"width|nullability\"), col(\"width|min\"), col(\"height|nullability\"), col(\"height|min\"), col(\"gross_weight|nullability\"), col(\"is_front_fog_lights|nullability\"), col(\"is_rear_window_wiper|nullability\"), col(\"is_rear_window_washer|nullability\"), col(\"is_rear_window_defogger|nullability\"), col(\"is_brake_assist|nullability\"), col(\"is_power_door_locks|nullability\"), col(\"is_central_locking|nullability\"), col(\"is_power_steering|nullability\"), col(\"is_driver_seat_height_adjustable|nullability\"), col(\"is_day_night_rear_view_mirror|nullability\"), col(\"is_ecw|nullability\"), col(\"is_speed_alert|nullability\"), col(\"ncap_rating|nullability\"), col(\"engine_type|nullability\"), col(\"make|nullability\"), col(\"max_torque_nm|nullability\"), col(\"max_torque_nm|inf\"), col(\"max_torque_nm|nan\"), col(\"max_torque_rpm|nullability\"), col(\"max_power_bhp|nullability\"), col(\"max_power_bhp|inf\"), col(\"max_power_bhp|nan\"), col(\"max_power_rpm|nullability\")])\n", + "\n", + "\n", + "\n", + "p16->p15\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p17\n", + "\n", + "WITH COLUMNS [col(\"model\").as_struct().is_duplicated().not().fill_null([true]).alias(\"primary_key\"), col(\"model\").is_not_null().fill_null([true]).alias(\"model|nullability\"), col(\"segment\").is_not_null().fill_null([true]).alias(\"segment|nullability\"), col(\"fuel_type\").is_not_null().fill_null([true]).alias(\"fuel_type|nullability\"), col(\"airbags\").is_not_null().fill_null([true]).alias(\"airbags|nullability\"), col(\"is_esc\").is_not_null().fill_null([true]).alias(\"is_esc|nullability\"), col(\"is_adjustable_steering\").is_not_null().fill_null([true]).alias(\"is_adjustable_steering|nullability\"), col(\"is_tpms\").is_not_null().fill_null([true]).alias(\"is_tpms|nullability\"), col(\"is_parking_sensors\").is_not_null().fill_null([true]).alias(\"is_parking_sensors|nullability\"), col(\"is_parking_camera\").is_not_null().fill_null([true]).alias(\"is_parking_camera|nullability\"), col(\"rear_brakes_type\").is_not_null().fill_null([true]).alias(\"rear_brakes_type|nullability\"), col(\"displacement\").is_not_null().fill_null([true]).alias(\"displacement|nullability\"), col(\"cylinder\").is_not_null().fill_null([true]).alias(\"cylinder|nullability\"), col(\"transmission_type\").is_not_null().fill_null([true]).alias(\"transmission_type|nullability\"), col(\"gear_box\").is_not_null().fill_null([true]).alias(\"gear_box|nullability\"), col(\"steering_type\").is_not_null().fill_null([true]).alias(\"steering_type|nullability\"), col(\"turning_radius\").is_not_null().fill_null([true]).alias(\"turning_radius|nullability\"), col(\"turning_radius\").is_infinite().not().fill_null([true]).alias(\"turning_radius|inf\"), col(\"turning_radius\").is_nan().not().fill_null([true]).alias(\"turning_radius|nan\"), col(\"length\").is_not_null().fill_null([true]).alias(\"length|nullability\"), [(col(\"length\")) >= (1000)].fill_null([true]).alias(\"length|min\"), col(\"width\").is_not_null().fill_null([true]).alias(\"width|nullability\"), [(col(\"width\")) >= (1000)].fill_null([true]).alias(\"width|min\"), col(\"height\").is_not_null().fill_null([true]).alias(\"height|nullability\"), [(col(\"height\")) >= (1000)].fill_null([true]).alias(\"height|min\"), col(\"gross_weight\").is_not_null().fill_null([true]).alias(\"gross_weight|nullability\"), col(\"is_front_fog_lights\").is_not_null().fill_null([true]).alias(\"is_front_fog_lights|nullability\"), col(\"is_rear_window_wiper\").is_not_null().fill_null([true]).alias(\"is_rear_window_wiper|nullability\"), col(\"is_rear_window_washer\").is_not_null().fill_null([true]).alias(\"is_rear_window_washer|nullability\"), col(\"is_rear_window_defogger\").is_not_null().fill_null([true]).alias(\"is_rear_window_defogger|nullability\"), col(\"is_brake_assist\").is_not_null().fill_null([true]).alias(\"is_brake_assist|nullability\"), col(\"is_power_door_locks\").is_not_null().fill_null([true]).alias(\"is_power_door_locks|nullability\"), col(\"is_central_locking\").is_not_null().fill_null([true]).alias(\"is_central_locking|nullability\"), col(\"is_power_steering\").is_not_null().fill_null([true]).alias(\"is_power_steering|nullability\"), col(\"is_driver_seat_height_adjustable\").is_not_null().fill_null([true]).alias(\"is_driver_seat_height_adjustable|nullability\"), col(\"is_day_night_rear_view_mirror\").is_not_null().fill_null([true]).alias(\"is_day_night_rear_view_mirror|nullability\"), col(\"is_ecw\").is_not_null().fill_null([true]).alias(\"is_ecw|nullability\"), col(\"is_speed_alert\").is_not_null().fill_null([true]).alias(\"is_speed_alert|nullability\"), col(\"ncap_rating\").is_not_null().fill_null([true]).alias(\"ncap_rating|nullability\"), col(\"engine_type\").is_not_null().fill_null([true]).alias(\"engine_type|nullability\"), col(\"make\").is_not_null().fill_null([true]).alias(\"make|nullability\"), col(\"max_torque_nm\").is_not_null().fill_null([true]).alias(\"max_torque_nm|nullability\"), col(\"max_torque_nm\").is_infinite().not().fill_null([true]).alias(\"max_torque_nm|inf\"), col(\"max_torque_nm\").is_nan().not().fill_null([true]).alias(\"max_torque_nm|nan\"), col(\"max_torque_rpm\").is_not_null().fill_null([true]).alias(\"max_torque_rpm|nullability\"), col(\"max_power_bhp\").is_not_null().fill_null([true]).alias(\"max_power_bhp|nullability\"), col(\"max_power_bhp\").is_infinite().not().fill_null([true]).alias(\"max_power_bhp|inf\"), col(\"max_power_bhp\").is_nan().not().fill_null([true]).alias(\"max_power_bhp|nan\"), col(\"max_power_rpm\").is_not_null().fill_null([true]).alias(\"max_power_rpm|nullability\")]\n", + "\n", + "\n", + "\n", + "p17->p16\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p18\n", + "\n", + "π 39/39\n", + "\n", + "\n", + "\n", + "p18->p17\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p19\n", + "\n", + "DISTINCT\n", + "\n", + "\n", + "\n", + "p19->p18\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p20\n", + "\n", + "WITH COLUMNS [when([(col(\"length\")) < (1000)]).then([(col(\"length\")) * (10)]).otherwise(col(\"length\")), when([(col(\"width\")) < (1000)]).then([(col(\"width\")) * (10)]).otherwise(col(\"width\")), when([(col(\"height\")) < (1000)]).then([(col(\"height\")) * (10)]).otherwise(col(\"height\"))]\n", + "\n", + "\n", + "\n", + "p20->p19\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p21\n", + "\n", + "WITH COLUMNS [col(\"steering_type\").strict_cast(Enum([...])), col(\"fuel_type\").strict_cast(Enum([...])), col(\"rear_brakes_type\").strict_cast(Enum([...])), col(\"engine_type\").strict_cast(Categorical), col(\"model\").strict_cast(Categorical), col(\"segment\").strict_cast(Categorical), col(\"width\").strict_cast(UInt16), col(\"height\").strict_cast(UInt16), col(\"length\").strict_cast(UInt16), col(\"displacement\").strict_cast(UInt16), col(\"cylinder\").strict_cast(UInt8), col(\"gross_weight\").strict_cast(UInt16), col(\"gear_box\").strict_cast(UInt8), col(\"airbags\").strict_cast(UInt8)]\n", + "\n", + "\n", + "\n", + "p21->p20\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p22\n", + "\n", + "WITH COLUMNS [col(\"max_power\").str.split([\"@\"]).list.get([dyn int: 0]).str.strip_suffix([\"bhp\"]).strict_cast(Float16).alias(\"max_power_bhp\"), col(\"max_power\").str.split([\"@\"]).list.get([dyn int: 1]).str.strip_suffix([\"rpm\"]).strict_cast(UInt16).alias(\"max_power_rpm\")]\n", + "\n", + "\n", + "\n", + "p22->p21\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p23\n", + "\n", + "WITH COLUMNS [col(\"max_torque\").str.split([\"@\"]).list.get([dyn int: 0]).str.strip_suffix([\"Nm\"]).strict_cast(Float32).alias(\"max_torque_nm\"), col(\"max_torque\").str.split([\"@\"]).list.get([dyn int: 1]).str.strip_suffix([\"rpm\"]).strict_cast(UInt16).alias(\"max_torque_rpm\")]\n", + "\n", + "\n", + "\n", + "p23->p22\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p24\n", + "\n", + "WITH COLUMNS [[(col(\"is_esc\")) == (\"Yes\")], [(col(\"is_adjustable_steering\")) == (\"Yes\")], [(col(\"is_tpms\")) == (\"Yes\")], [(col(\"is_parking_sensors\")) == (\"Yes\")], [(col(\"is_parking_camera\")) == (\"Yes\")], [(col(\"is_front_fog_lights\")) == (\"Yes\")], [(col(\"is_rear_window_wiper\")) == (\"Yes\")], [(col(\"is_rear_window_washer\")) == (\"Yes\")], [(col(\"is_rear_window_defogger\")) == (\"Yes\")], [(col(\"is_brake_assist\")) == (\"Yes\")], [(col(\"is_power_door_locks\")) == (\"Yes\")], [(col(\"is_central_locking\")) == (\"Yes\")], [(col(\"is_power_steering\")) == (\"Yes\")], [(col(\"is_driver_seat_height_adjustable\")) == (\"Yes\")], [(col(\"is_day_night_rear_view_mirror\")) == (\"Yes\")], [(col(\"is_ecw\")) == (\"Yes\")], [(col(\"is_speed_alert\")) == (\"Yes\")]]\n", + "\n", + "\n", + "\n", + "p24->p23\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p25\n", + "\n", + "DISTINCT\n", + "\n", + "\n", + "\n", + "p25->p24\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p26\n", + "\n", + "TABLE\n", + "π */37\n", + "\n", + "\n", + "\n", + "p26->p25\n", + "\n", + "\n", "\n", "\n", "" @@ -904,125 +1103,321 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 41, "id": "b271446b", "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ - "\n", - "\n", + "\n", + "\n", "polars_query\n", - "\n", + "\n", "\n", "\n", "p2\n", - "\n", - "FILTER BY col(\"count\").dynamic_predicate()\n", + "\n", + "FILTER BY col(\"primary_key\")./Users/quantco/repos/tutorial-pycon26-polars-dataframely/.pixi/envs/default/lib/python3.14/site-packages/dataframely/_native.abi3.so:all_rules_required([col(\"make|nullability\"), col(\"model|nullability\"), col(\"count|nullability\")])\n", "\n", "\n", "\n", "p1\n", - "\n", - "SORT BY [col(\"count\")]\n", + "\n", + "simple π 3/7\n", + "[\"make\", \"model\", \"count\"]\n", "\n", "\n", "\n", "p2->p1\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "p3\n", - "\n", - "AGG [len().alias(\"count\")]\n", - "BY\n", - "[col(\"make\"), col(\"model\")]\n", + "\n", + "WITH COLUMNS [col(\"model\").as_struct().is_duplicated().not().fill_null([true]).alias(\"primary_key\"), col(\"make\").is_not_null().fill_null([true]).alias(\"make|nullability\"), col(\"model\").is_not_null().fill_null([true]).alias(\"model|nullability\"), col(\"count\").is_not_null().fill_null([true]).alias(\"count|nullability\")]\n", "\n", "\n", "\n", "p3->p2\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "p4\n", - "\n", - "simple π 2/2\n", - "[\"make\", \"model\"]\n", + "\n", + "π 3/3\n", "\n", "\n", "\n", "p4->p3\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "p5\n", - "\n", - "JOIN INNER\n", - "left: [col(\"model\")];\n", - "right: [col(\"model\")]\n", + "\n", + "SORT BY [col(\"count\")]\n", "\n", "\n", "\n", "p5->p4\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "p6\n", - "\n", - "WITH COLUMNS [col(\"model\").strict_cast(Categorical)]\n", + "\n", + "FILTER BY col(\"count\").dynamic_predicate()\n", "\n", "\n", "\n", "p6->p5\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "p7\n", - "\n", - "TABLE\n", - "π 1/7\n", + "\n", + "AGG [len().alias(\"count\")]\n", + "BY\n", + "[col(\"make\"), col(\"model\")]\n", "\n", "\n", "\n", "p7->p6\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "p8\n", - "\n", - "WITH COLUMNS [col(\"model\").strict_cast(Categorical)]\n", + "\n", + "simple π 2/2\n", + "[\"make\", \"model\"]\n", "\n", - "\n", + "\n", "\n", - "p8->p5\n", - "\n", - "\n", + "p8->p7\n", + "\n", + "\n", "\n", "\n", "\n", "p9\n", - "\n", - "TABLE\n", - "π 2/39\n", + "\n", + "JOIN INNER\n", + "left: [col(\"model\")];\n", + "right: [col(\"model\")]\n", "\n", "\n", "\n", "p9->p8\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p10\n", + "\n", + "simple π 1/22\n", + "[\"model\"]\n", + "\n", + "\n", + "\n", + "p10->p9\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p11\n", + "\n", + "FILTER BY col(\"primary_key\")./Users/quantco/repos/tutorial-pycon26-polars-dataframely/.pixi/envs/default/lib/python3.14/site-packages/dataframely/_native.abi3.so:all_rules_required([col(\"policy_id|nullability\"), col(\"policy_tenure|nullability\"), col(\"policy_tenure|inf\"), col(\"policy_tenure|nan\"), col(\"age_of_car|nullability\"), col(\"age_of_car|inf\"), col(\"age_of_car|nan\"), col(\"age_of_policyholder|nullability\"), col(\"age_of_policyholder|inf\"), col(\"age_of_policyholder|nan\"), col(\"area_cluster|nullability\"), col(\"population_density|nullability\"), col(\"population_density|inf\"), col(\"population_density|nan\")])\n", + "\n", + "\n", + "\n", + "p11->p10\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p12\n", + "\n", + "WITH COLUMNS [col(\"policy_id\").as_struct().is_duplicated().not().fill_null([true]).alias(\"primary_key\"), col(\"policy_id\").is_not_null().fill_null([true]).alias(\"policy_id|nullability\"), col(\"policy_tenure\").is_not_null().fill_null([true]).alias(\"policy_tenure|nullability\"), col(\"policy_tenure\").is_infinite().not().fill_null([true]).alias(\"policy_tenure|inf\"), col(\"policy_tenure\").is_nan().not().fill_null([true]).alias(\"policy_tenure|nan\"), col(\"age_of_car\").is_not_null().fill_null([true]).alias(\"age_of_car|nullability\"), col(\"age_of_car\").is_infinite().not().fill_null([true]).alias(\"age_of_car|inf\"), col(\"age_of_car\").is_nan().not().fill_null([true]).alias(\"age_of_car|nan\"), col(\"age_of_policyholder\").is_not_null().fill_null([true]).alias(\"age_of_policyholder|nullability\"), col(\"age_of_policyholder\").is_infinite().not().fill_null([true]).alias(\"age_of_policyholder|inf\"), col(\"age_of_policyholder\").is_nan().not().fill_null([true]).alias(\"age_of_policyholder|nan\"), col(\"area_cluster\").is_not_null().fill_null([true]).alias(\"area_cluster|nullability\"), col(\"population_density\").is_not_null().fill_null([true]).alias(\"population_density|nullability\"), col(\"population_density\").is_infinite().not().fill_null([true]).alias(\"population_density|inf\"), col(\"population_density\").is_nan().not().fill_null([true]).alias(\"population_density|nan\")]\n", + "\n", + "\n", + "\n", + "p12->p11\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p13\n", + "\n", + "WITH COLUMNS [col(\"model\").strict_cast(Categorical), col(\"area_cluster\").strict_cast(Categorical), col(\"policy_tenure\").cast(Float32), col(\"age_of_car\").cast(Float32), col(\"age_of_policyholder\").cast(Float32), col(\"population_density\").strict_cast(Float32), col(\"policy_id\").str.strip_prefix([\"policy\"]).strict_cast(UInt64)]\n", + "\n", + "\n", + "\n", + "p13->p12\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p14\n", + "\n", + "TABLE\n", + "π */7\n", + "\n", + "\n", + "\n", + "p14->p13\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p15\n", + "\n", + "simple π 2/88\n", + "[\"model\", \"make\"]\n", + "\n", + "\n", + "\n", + "p15->p9\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p16\n", + "\n", + "FILTER BY col(\"primary_key\")./Users/quantco/repos/tutorial-pycon26-polars-dataframely/.pixi/envs/default/lib/python3.14/site-packages/dataframely/_native.abi3.so:all_rules_required([col(\"model|nullability\"), col(\"segment|nullability\"), col(\"fuel_type|nullability\"), col(\"airbags|nullability\"), col(\"is_esc|nullability\"), col(\"is_adjustable_steering|nullability\"), col(\"is_tpms|nullability\"), col(\"is_parking_sensors|nullability\"), col(\"is_parking_camera|nullability\"), col(\"rear_brakes_type|nullability\"), col(\"displacement|nullability\"), col(\"cylinder|nullability\"), col(\"transmission_type|nullability\"), col(\"gear_box|nullability\"), col(\"steering_type|nullability\"), col(\"turning_radius|nullability\"), col(\"turning_radius|inf\"), col(\"turning_radius|nan\"), col(\"length|nullability\"), col(\"length|min\"), col(\"width|nullability\"), col(\"width|min\"), col(\"height|nullability\"), col(\"height|min\"), col(\"gross_weight|nullability\"), col(\"is_front_fog_lights|nullability\"), col(\"is_rear_window_wiper|nullability\"), col(\"is_rear_window_washer|nullability\"), col(\"is_rear_window_defogger|nullability\"), col(\"is_brake_assist|nullability\"), col(\"is_power_door_locks|nullability\"), col(\"is_central_locking|nullability\"), col(\"is_power_steering|nullability\"), col(\"is_driver_seat_height_adjustable|nullability\"), col(\"is_day_night_rear_view_mirror|nullability\"), col(\"is_ecw|nullability\"), col(\"is_speed_alert|nullability\"), col(\"ncap_rating|nullability\"), col(\"engine_type|nullability\"), col(\"make|nullability\"), col(\"max_torque_nm|nullability\"), col(\"max_torque_nm|inf\"), col(\"max_torque_nm|nan\"), col(\"max_torque_rpm|nullability\"), col(\"max_power_bhp|nullability\"), col(\"max_power_bhp|inf\"), col(\"max_power_bhp|nan\"), col(\"max_power_rpm|nullability\")])\n", + "\n", + "\n", + "\n", + "p16->p15\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p17\n", + "\n", + "WITH COLUMNS [col(\"model\").as_struct().is_duplicated().not().fill_null([true]).alias(\"primary_key\"), col(\"model\").is_not_null().fill_null([true]).alias(\"model|nullability\"), col(\"segment\").is_not_null().fill_null([true]).alias(\"segment|nullability\"), col(\"fuel_type\").is_not_null().fill_null([true]).alias(\"fuel_type|nullability\"), col(\"airbags\").is_not_null().fill_null([true]).alias(\"airbags|nullability\"), col(\"is_esc\").is_not_null().fill_null([true]).alias(\"is_esc|nullability\"), col(\"is_adjustable_steering\").is_not_null().fill_null([true]).alias(\"is_adjustable_steering|nullability\"), col(\"is_tpms\").is_not_null().fill_null([true]).alias(\"is_tpms|nullability\"), col(\"is_parking_sensors\").is_not_null().fill_null([true]).alias(\"is_parking_sensors|nullability\"), col(\"is_parking_camera\").is_not_null().fill_null([true]).alias(\"is_parking_camera|nullability\"), col(\"rear_brakes_type\").is_not_null().fill_null([true]).alias(\"rear_brakes_type|nullability\"), col(\"displacement\").is_not_null().fill_null([true]).alias(\"displacement|nullability\"), col(\"cylinder\").is_not_null().fill_null([true]).alias(\"cylinder|nullability\"), col(\"transmission_type\").is_not_null().fill_null([true]).alias(\"transmission_type|nullability\"), col(\"gear_box\").is_not_null().fill_null([true]).alias(\"gear_box|nullability\"), col(\"steering_type\").is_not_null().fill_null([true]).alias(\"steering_type|nullability\"), col(\"turning_radius\").is_not_null().fill_null([true]).alias(\"turning_radius|nullability\"), col(\"turning_radius\").is_infinite().not().fill_null([true]).alias(\"turning_radius|inf\"), col(\"turning_radius\").is_nan().not().fill_null([true]).alias(\"turning_radius|nan\"), col(\"length\").is_not_null().fill_null([true]).alias(\"length|nullability\"), [(col(\"length\")) >= (1000)].fill_null([true]).alias(\"length|min\"), col(\"width\").is_not_null().fill_null([true]).alias(\"width|nullability\"), [(col(\"width\")) >= (1000)].fill_null([true]).alias(\"width|min\"), col(\"height\").is_not_null().fill_null([true]).alias(\"height|nullability\"), [(col(\"height\")) >= (1000)].fill_null([true]).alias(\"height|min\"), col(\"gross_weight\").is_not_null().fill_null([true]).alias(\"gross_weight|nullability\"), col(\"is_front_fog_lights\").is_not_null().fill_null([true]).alias(\"is_front_fog_lights|nullability\"), col(\"is_rear_window_wiper\").is_not_null().fill_null([true]).alias(\"is_rear_window_wiper|nullability\"), col(\"is_rear_window_washer\").is_not_null().fill_null([true]).alias(\"is_rear_window_washer|nullability\"), col(\"is_rear_window_defogger\").is_not_null().fill_null([true]).alias(\"is_rear_window_defogger|nullability\"), col(\"is_brake_assist\").is_not_null().fill_null([true]).alias(\"is_brake_assist|nullability\"), col(\"is_power_door_locks\").is_not_null().fill_null([true]).alias(\"is_power_door_locks|nullability\"), col(\"is_central_locking\").is_not_null().fill_null([true]).alias(\"is_central_locking|nullability\"), col(\"is_power_steering\").is_not_null().fill_null([true]).alias(\"is_power_steering|nullability\"), col(\"is_driver_seat_height_adjustable\").is_not_null().fill_null([true]).alias(\"is_driver_seat_height_adjustable|nullability\"), col(\"is_day_night_rear_view_mirror\").is_not_null().fill_null([true]).alias(\"is_day_night_rear_view_mirror|nullability\"), col(\"is_ecw\").is_not_null().fill_null([true]).alias(\"is_ecw|nullability\"), col(\"is_speed_alert\").is_not_null().fill_null([true]).alias(\"is_speed_alert|nullability\"), col(\"ncap_rating\").is_not_null().fill_null([true]).alias(\"ncap_rating|nullability\"), col(\"engine_type\").is_not_null().fill_null([true]).alias(\"engine_type|nullability\"), col(\"make\").is_not_null().fill_null([true]).alias(\"make|nullability\"), col(\"max_torque_nm\").is_not_null().fill_null([true]).alias(\"max_torque_nm|nullability\"), col(\"max_torque_nm\").is_infinite().not().fill_null([true]).alias(\"max_torque_nm|inf\"), col(\"max_torque_nm\").is_nan().not().fill_null([true]).alias(\"max_torque_nm|nan\"), col(\"max_torque_rpm\").is_not_null().fill_null([true]).alias(\"max_torque_rpm|nullability\"), col(\"max_power_bhp\").is_not_null().fill_null([true]).alias(\"max_power_bhp|nullability\"), col(\"max_power_bhp\").is_infinite().not().fill_null([true]).alias(\"max_power_bhp|inf\"), col(\"max_power_bhp\").is_nan().not().fill_null([true]).alias(\"max_power_bhp|nan\"), col(\"max_power_rpm\").is_not_null().fill_null([true]).alias(\"max_power_rpm|nullability\")]\n", + "\n", + "\n", + "\n", + "p17->p16\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p18\n", + "\n", + "π 39/39\n", + "\n", + "\n", + "\n", + "p18->p17\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p19\n", + "\n", + "DISTINCT\n", + "\n", + "\n", + "\n", + "p19->p18\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p20\n", + "\n", + "WITH COLUMNS [when([(col(\"length\")) < (1000)]).then([(col(\"length\")) * (10)]).otherwise(col(\"length\")), when([(col(\"width\")) < (1000)]).then([(col(\"width\")) * (10)]).otherwise(col(\"width\")), when([(col(\"height\")) < (1000)]).then([(col(\"height\")) * (10)]).otherwise(col(\"height\"))]\n", + "\n", + "\n", + "\n", + "p20->p19\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p21\n", + "\n", + "simple π 41/43\n", + "[\"model\", \"segment\", ... 39 other columns]\n", + "\n", + "\n", + "\n", + "p21->p20\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p22\n", + "\n", + "WITH COLUMNS [[(col(\"is_esc\")) == (\"Yes\")], [(col(\"is_adjustable_steering\")) == (\"Yes\")], [(col(\"is_tpms\")) == (\"Yes\")], [(col(\"is_parking_sensors\")) == (\"Yes\")], [(col(\"is_parking_camera\")) == (\"Yes\")], [(col(\"is_front_fog_lights\")) == (\"Yes\")], [(col(\"is_rear_window_wiper\")) == (\"Yes\")], [(col(\"is_rear_window_washer\")) == (\"Yes\")], [(col(\"is_rear_window_defogger\")) == (\"Yes\")], [(col(\"is_brake_assist\")) == (\"Yes\")], [(col(\"is_power_door_locks\")) == (\"Yes\")], [(col(\"is_central_locking\")) == (\"Yes\")], [(col(\"is_power_steering\")) == (\"Yes\")], [(col(\"is_driver_seat_height_adjustable\")) == (\"Yes\")], [(col(\"is_day_night_rear_view_mirror\")) == (\"Yes\")], [(col(\"is_ecw\")) == (\"Yes\")], [(col(\"is_speed_alert\")) == (\"Yes\")], col(\"__POLARS_CSER_0xa9cfb20ce426e66b\").alias(\"max_torque\").list.get([dyn int: 0]).str.strip_suffix([\"Nm\"]).strict_cast(Float32).alias(\"max_torque_nm\"), col(\"__POLARS_CSER_0xa9cfb20ce426e66b\").alias(\"max_torque\").list.get([dyn int: 1]).str.strip_suffix([\"rpm\"]).strict_cast(UInt16).alias(\"max_torque_rpm\"), col(\"__POLARS_CSER_0xe7c2050176993c32\").alias(\"max_power\").list.get([dyn int: 0]).str.strip_suffix([\"bhp\"]).strict_cast(Float16).alias(\"max_power_bhp\"), col(\"__POLARS_CSER_0xe7c2050176993c32\").alias(\"max_power\").list.get([dyn int: 1]).str.strip_suffix([\"rpm\"]).strict_cast(UInt16).alias(\"max_power_rpm\"), col(\"steering_type\").strict_cast(Enum([...])), col(\"fuel_type\").strict_cast(Enum([...])), col(\"rear_brakes_type\").strict_cast(Enum([...])), col(\"engine_type\").strict_cast(Categorical), col(\"model\").strict_cast(Categorical), col(\"segment\").strict_cast(Categorical), col(\"width\").strict_cast(UInt16), col(\"height\").strict_cast(UInt16), col(\"length\").strict_cast(UInt16), col(\"displacement\").strict_cast(UInt16), col(\"cylinder\").strict_cast(UInt8), col(\"gross_weight\").strict_cast(UInt16), col(\"gear_box\").strict_cast(UInt8), col(\"airbags\").strict_cast(UInt8)]\n", + "\n", + "\n", + "\n", + "p22->p21\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p23\n", + "\n", + "WITH COLUMNS [col(\"max_power\").str.split([\"@\"]).alias(\"__POLARS_CSER_0xe7c2050176993c32\"), col(\"max_torque\").str.split([\"@\"]).alias(\"__POLARS_CSER_0xa9cfb20ce426e66b\")]\n", + "\n", + "\n", + "\n", + "p23->p22\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p24\n", + "\n", + "DISTINCT\n", + "\n", + "\n", + "\n", + "p24->p23\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "p25\n", + "\n", + "TABLE\n", + "π */37\n", + "\n", + "\n", + "\n", + "p25->p24\n", + "\n", + "\n", "\n", "\n", "" @@ -1042,7 +1437,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 42, "id": "83e52970", "metadata": {}, "outputs": [ @@ -1050,7 +1445,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "48.9 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "86.4 ms ± 2.74 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -1063,7 +1458,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 43, "id": "b8ef4d24", "metadata": {}, "outputs": [ @@ -1071,7 +1466,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "95 ms ± 3.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "84.6 ms ± 1.61 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -1090,7 +1485,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "3cf919df", "metadata": {}, "outputs": [], @@ -1103,6 +1498,239 @@ "# - Remember the problem so we can fix it in preprocessing in the next step." ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "51db629a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6a393ab", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c162d7c5", + "metadata": {}, + "outputs": [ + { + "ename": "ValidationError", + "evalue": "1 rules failed validation:\n - 'primary_key' failed for 20 rows", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mValidationError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[51]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m pipeline.schema.raw \u001b[38;5;28;01mimport\u001b[39;00m RawModelsSchema\n\u001b[32m 2\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m RawModelsSchema.validate(raw.models, cast=\u001b[38;5;28;01mTrue\u001b[39;00m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/repos/tutorial-pycon26-polars-dataframely/.pixi/envs/default/lib/python3.14/site-packages/dataframely/schema.py:573\u001b[39m, in \u001b[36mSchema.validate\u001b[39m\u001b[34m(cls, df, cast, eager)\u001b[39m\n\u001b[32m 571\u001b[39m out, failure = \u001b[38;5;28mcls\u001b[39m.filter(df, cast=cast, eager=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m 572\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(failure) > \u001b[32m0\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m573\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m ValidationError(\n\u001b[32m 574\u001b[39m format_rule_failures(\u001b[38;5;28mlist\u001b[39m(failure.counts().items()))\n\u001b[32m 575\u001b[39m )\n\u001b[32m 576\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[32m 577\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[31mValidationError\u001b[39m: 1 rules failed validation:\n - 'primary_key' failed for 20 rows" + ] + } + ], + "source": [ + "from pipeline.schema.raw import RawModelsSchema, RawPoliciesSchema\n", + "\n", + "RawModelsSchema.validate(raw.models, cast=True);" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "36aec237", + "metadata": {}, + "outputs": [], + "source": [ + "good_models, failures = RawModelsSchema.filter(\n", + " models\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "478f96fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'primary_key': 20}" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "failures.counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "4497b797", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (20, 83)
modelsegmentfuel_typeairbagsis_escis_adjustable_steeringis_tpmsis_parking_sensorsis_parking_camerarear_brakes_typedisplacementcylindertransmission_typegear_boxsteering_typeturning_radiuslengthwidthheightgross_weightis_front_fog_lightsis_rear_window_wiperis_rear_window_washeris_rear_window_defoggeris_brake_assistis_power_door_locksis_central_lockingis_power_steeringis_driver_seat_height_adjustableis_day_night_rear_view_mirroris_ecwis_speed_alertncap_ratingengine_typemakemax_powermax_torqueis_parking_camera|nullabilityrear_brakes_type|nullabilitydisplacement|nullabilitycylinder|nullabilitytransmission_type|nullabilitygear_box|nullabilitysteering_type|nullabilityturning_radius|nullabilityturning_radius|infturning_radius|nanlength|nullabilitylength|inflength|nanwidth|nullabilitywidth|infwidth|nanheight|nullabilityheight|infheight|nangross_weight|nullabilityis_front_fog_lights|nullabilityis_rear_window_wiper|nullabilityis_rear_window_washer|nullabilityis_rear_window_defogger|nullabilityis_brake_assist|nullabilityis_power_door_locks|nullabilityis_central_locking|nullabilityis_power_steering|nullabilityis_driver_seat_height_adjustable|nullabilityis_day_night_rear_view_mirror|nullabilityis_ecw|nullabilityis_speed_alert|nullabilityncap_rating|nullabilityengine_type|nullabilitymake|nullabilitymax_power|nullabilitymax_torque|nullability
strstrstri64strstrstrstrstrstri64i64stri64strf64f64f64f64i64strstrstrstrstrstrstrstrstrstrstrstri64stru64strstrenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenumenum
"M1107""Utility""Petrol"1"No""Yes""No""Yes""Yes""Disc"14984"Manual"5"Electric"4.9401.2932167.6312155.6171388"No""Yes""Yes""No""Yes""Yes""Yes""Yes""No""Yes""Yes""Yes"5"2.0 L TDI CR"4"118.375bhp@5999rpm""249.2Nm@4396rpm""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid"
"M1107""Utility""Petrol"1"No""Yes""No""Yes""Yes""Disc"14984"Manual"5"Electric"4.9401.2932167.6312155.6171388"No""Yes""Yes""No""Yes""Yes""Yes""Yes""No""Yes""Yes""Yes"5"2.0 L TDI CR"4"118.375bhp@5999rpm""249.2Nm@4396rpm""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid"
"M254""Utility""CNG"1"No""Yes""No""Yes""No""Drum"10763"Manual"5"Power"4.56328.0476154.8038143.18661601"No""No""No""No""Yes""No""Yes""Yes""No""No""No""Yes"4"2.0 L e:HEV"0"71.8125bhp@6000rpm""90.3Nm@4332rpm""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid"
"M254""Utility""CNG"1"No""Yes""No""Yes""No""Drum"10763"Manual"5"Power"4.56328.0476154.8038143.18661601"No""No""No""No""Yes""No""Yes""Yes""No""No""No""Yes"4"2.0 L e:HEV"0"71.8125bhp@6000rpm""90.3Nm@4332rpm""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid"
"M324""Utility""Petrol"1"No""Yes""No""Yes""No""Drum"12273"Manual"5"Electric"4.623891.691775.071543.411460"No""No""Yes""No""Yes""Yes""Yes""Yes""No""Yes""Yes""Yes"5"1.6 L DDiS 320"11"104.5625bhp@5998rpm""155.1Nm@4394rpm""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid"
"M937""Utility""CNG"1"No""No""No""Yes""No""Drum"10343"Manual"5"Power"4.73474.0381672.6481477.041199"No""No""No""Yes""No""No""No""Yes""Yes""Yes""Yes""Yes"2"1.2 L mStallion"0"59.84375bhp@5881rpm""85.1Nm@3501rpm""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid"
"M946""Utility""CNG"1"No""No""No""Yes""No""Drum"10193"Manual"5"Power"4.58324.3782141.2208140.94061474"No""No""No""No""No""No""Yes""Yes""No""No""No""Yes"3"1.5 L Kryotec"2"57.28125bhp@6000rpm""82.9Nm@3797rpm""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid"
"M946""Utility""CNG"1"No""No""No""Yes""No""Drum"10193"Manual"5"Power"4.58324.3782141.2208140.94061474"No""No""No""No""No""No""Yes""Yes""No""No""No""Yes"3"1.5 L Kryotec"2"57.28125bhp@6000rpm""82.9Nm@3797rpm""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid"
"M964""Utility""Diesel"1"Yes""Yes""No""Yes""Yes""Drum"12524"Manual"5"Power"4.844015.2641589.9041760.7041720"No""No""No""No""No""Yes""Yes""Yes""No""No""Yes""Yes"5"1.2 L 3NR-FE"10"87.625bhp@5979rpm""144.9Nm@4152rpm""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid"
"M964""Utility""Diesel"1"Yes""Yes""No""Yes""Yes""Drum"12524"Manual"5"Power"4.844015.2641589.9041760.7041720"No""No""No""No""No""Yes""Yes""Yes""No""No""Yes""Yes"5"1.2 L 3NR-FE"10"87.625bhp@5979rpm""144.9Nm@4152rpm""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid""valid"
" + ], + "text/plain": [ + "shape: (20, 83)\n", + "┌───────┬─────────┬───────────┬─────────┬───┬─────────────┬─────────────┬─────────────┬────────────┐\n", + "│ model ┆ segment ┆ fuel_type ┆ airbags ┆ … ┆ engine_type ┆ make|nullab ┆ max_power|n ┆ max_torque │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ |nullabilit ┆ ility ┆ ullability ┆ |nullabili │\n", + "│ str ┆ str ┆ str ┆ i64 ┆ ┆ y ┆ --- ┆ --- ┆ ty │\n", + "│ ┆ ┆ ┆ ┆ ┆ --- ┆ enum ┆ enum ┆ --- │\n", + "│ ┆ ┆ ┆ ┆ ┆ enum ┆ ┆ ┆ enum │\n", + "╞═══════╪═════════╪═══════════╪═════════╪═══╪═════════════╪═════════════╪═════════════╪════════════╡\n", + "│ M1107 ┆ Utility ┆ Petrol ┆ 1 ┆ … ┆ valid ┆ valid ┆ valid ┆ valid │\n", + "│ M1107 ┆ Utility ┆ Petrol ┆ 1 ┆ … ┆ valid ┆ valid ┆ valid ┆ valid │\n", + "│ M254 ┆ Utility ┆ CNG ┆ 1 ┆ … ┆ valid ┆ valid ┆ valid ┆ valid │\n", + "│ M254 ┆ Utility ┆ CNG ┆ 1 ┆ … ┆ valid ┆ valid ┆ valid ┆ valid │\n", + "│ M324 ┆ Utility ┆ Petrol ┆ 1 ┆ … ┆ valid ┆ valid ┆ valid ┆ valid │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ M937 ┆ Utility ┆ CNG ┆ 1 ┆ … ┆ valid ┆ valid ┆ valid ┆ valid │\n", + "│ M946 ┆ Utility ┆ CNG ┆ 1 ┆ … ┆ valid ┆ valid ┆ valid ┆ valid │\n", + "│ M946 ┆ Utility ┆ CNG ┆ 1 ┆ … ┆ valid ┆ valid ┆ valid ┆ valid │\n", + "│ M964 ┆ Utility ┆ Diesel ┆ 1 ┆ … ┆ valid ┆ valid ┆ valid ┆ valid │\n", + "│ M964 ┆ Utility ┆ Diesel ┆ 1 ┆ … ┆ valid ┆ valid ┆ valid ┆ valid │\n", + "└───────┴─────────┴───────────┴─────────┴───┴─────────────┴─────────────┴─────────────┴────────────┘" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "failures.details().sort(\"model\")" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "77e533f2", + "metadata": {}, + "outputs": [], + "source": [ + "good_models, failures = RawModelsSchema.filter(\n", + " models.unique()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "c2f9f441", + "metadata": {}, + "outputs": [], + "source": [ + "RawPoliciesSchema.validate(policies, cast=True);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d8c5ab2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 2)
modellen
stru32
"M937"2
"M477"2
"M676"2
"M964"2
"M946"2
"M254"2
"M40"2
"M324"2
"M1107"2
"M910"2
" + ], + "text/plain": [ + "shape: (10, 2)\n", + "┌───────┬─────┐\n", + "│ model ┆ len │\n", + "│ --- ┆ --- │\n", + "│ str ┆ u32 │\n", + "╞═══════╪═════╡\n", + "│ M937 ┆ 2 │\n", + "│ M477 ┆ 2 │\n", + "│ M676 ┆ 2 │\n", + "│ M964 ┆ 2 │\n", + "│ M946 ┆ 2 │\n", + "│ M254 ┆ 2 │\n", + "│ M40 ┆ 2 │\n", + "│ M324 ┆ 2 │\n", + "│ M1107 ┆ 2 │\n", + "│ M910 ┆ 2 │\n", + "└───────┴─────┘" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The raw table has a few models with two rows\n", + "raw.models.group_by(\"model\").len().filter(pl.col(\"len\") > 1).collect()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c64af62", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (0, 2)
modellen
stru32
" + ], + "text/plain": [ + "shape: (0, 2)\n", + "┌───────┬─────┐\n", + "│ model ┆ len │\n", + "│ --- ┆ --- │\n", + "│ str ┆ u32 │\n", + "╞═══════╪═════╡\n", + "└───────┴─────┘" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The lines are truly duplicate, i.e. all other columns are also the same\n", + "# so a pragmatic solution is to drop them.\n", + "raw.models.unique().group_by(\"model\").len().filter(pl.col(\"len\") > 1).collect()" + ] + }, { "cell_type": "markdown", "id": "657906a5", @@ -1113,7 +1741,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "id": "42298efc", "metadata": {}, "outputs": [], @@ -1136,7 +1764,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "79a0ab8d", "metadata": {}, "outputs": [], @@ -1156,7 +1784,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "4ec780da", "metadata": {}, "outputs": [],