From 26b683e901660b1774f2e0a0126f4d771138456a Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Wed, 11 Feb 2026 19:50:17 +0000 Subject: [PATCH 1/4] feat: add bigquery.ai.generate_table function --- GEMINI.md | 42 +------------ bigframes/bigquery/_operations/ai.py | 93 ++++++++++++++++++++++++++++ bigframes/bigquery/ai.py | 2 + tests/unit/bigquery/test_ai.py | 54 +++++++++++++++- 4 files changed, 150 insertions(+), 41 deletions(-) diff --git a/GEMINI.md b/GEMINI.md index 0d447f17a4..0d74e277ad 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -2,48 +2,12 @@ ## Testing -We use `nox` to instrument our tests. +We use `pytest` to instrument our tests. -- To test your changes, run unit tests with `nox`: +- To test your changes, run unit tests with `pytest`: ```bash - nox -r -s unit - ``` - -- To run a single unit test: - - ```bash - nox -r -s unit-3.13 -- -k - ``` - -- Ignore this step if you lack access to Google Cloud resources. To run system - tests, you can execute:: - - # Run all system tests - $ nox -r -s system - - # Run a single system test - $ nox -r -s system-3.13 -- -k - -- The codebase must have better coverage than it had previously after each - change. You can test coverage via `nox -s unit system cover` (takes a long - time). Omit `system` if you lack access to cloud resources. - -## Code Style - -- We use the automatic code formatter `black`. You can run it using - the nox session `format`. This will eliminate many lint errors. Run via: - - ```bash - nox -r -s format - ``` - -- PEP8 compliance is required, with exceptions defined in the linter configuration. - If you have ``nox`` installed, you can test that you have not introduced - any non-compliant code via: - - ``` - nox -r -s lint + pytest :: ``` - When writing tests, use the idiomatic "pytest" style. 
diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index bc2ab8dd20..16ca989bf0 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -601,6 +601,99 @@ def generate_text( return session.read_gbq_query(query) +@log_adapter.method_logger(custom_base_name="bigquery_ai") +def generate_table( + model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series], + data: Union[dataframe.DataFrame, pd.DataFrame], + *, + output_schema: str, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_output_tokens: Optional[int] = None, + stop_sequences: Optional[List[str]] = None, + request_type: Optional[str] = None, +) -> dataframe.DataFrame: + """ + Generates a table using a BigQuery ML model. + + See the `AI.GENERATE_TABLE function syntax + `_ + for additional reference. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> # The user is responsible for constructing a DataFrame that contains + >>> # the necessary columns for the model's prompt. For example, a + >>> # DataFrame with a 'prompt' column for text classification. + >>> df = bpd.DataFrame({'prompt': ["some text to classify"]}) + >>> result = bbq.ai.generate_table( + ... "project.dataset.model_name", + ... data=df, + ... output_schema="category STRING" + ... ) # doctest: +SKIP + + Args: + model (bigframes.ml.base.BaseEstimator or str): + The model to use for table generation. + data (bigframes.pandas.DataFrame or pandas.DataFrame): + The data to use as input for table generation. It must contain the + columns that the model expects for constructing the prompt. + output_schema (str): + A string defining the output schema (e.g., "col1 STRING, col2 INT64"). + temperature (float, optional): + A FLOAT64 value that is used for sampling promiscuity. The value + must be in the range ``[0.0, 1.0]``. 
+ top_p (float, optional): + A FLOAT64 value that changes how the model selects tokens for + output. + max_output_tokens (int, optional): + An INT64 value that sets the maximum number of tokens in the + generated text. + stop_sequences (List[str], optional): + An ARRAY value that contains the stop sequences for the model. + request_type (str, optional): + A STRING value that contains the request type for the model. + + Returns: + bigframes.pandas.DataFrame: + The generated table. + """ + data = _to_dataframe(data, series_rename="prompt") + model_name, session = bq_utils.get_model_name_and_session(model, data) + table_sql = bq_utils.to_sql(data) + + struct_fields_bq: Dict[str, bigframes.core.sql.literals.STRUCT_VALUES] = { + "output_schema": output_schema + } + if temperature is not None: + struct_fields_bq["temperature"] = temperature + if top_p is not None: + struct_fields_bq["top_p"] = top_p + if max_output_tokens is not None: + struct_fields_bq["max_output_tokens"] = max_output_tokens + if stop_sequences is not None: + struct_fields_bq["stop_sequences"] = stop_sequences + if request_type is not None: + struct_fields_bq["request_type"] = request_type + + struct_sql = bigframes.core.sql.literals.struct_literal(struct_fields_bq) + query = f""" + SELECT * + FROM AI.GENERATE_TABLE( + MODEL `{model_name}`, + ({table_sql}), + {struct_sql} + ) + """ + + if session is None: + return bpd.read_gbq_query(query) + else: + return session.read_gbq_query(query) + + @log_adapter.method_logger(custom_base_name="bigquery_ai") def if_( prompt: PROMPT_TYPE, diff --git a/bigframes/bigquery/ai.py b/bigframes/bigquery/ai.py index 053ee7352a..bb24d5dc33 100644 --- a/bigframes/bigquery/ai.py +++ b/bigframes/bigquery/ai.py @@ -24,6 +24,7 @@ generate_double, generate_embedding, generate_int, + generate_table, generate_text, if_, score, @@ -37,6 +38,7 @@ "generate_double", "generate_embedding", "generate_int", + "generate_table", "generate_text", "if_", "score", diff --git 
a/tests/unit/bigquery/test_ai.py b/tests/unit/bigquery/test_ai.py index 0be32b9e8a..d0e5f76414 100644 --- a/tests/unit/bigquery/test_ai.py +++ b/tests/unit/bigquery/test_ai.py @@ -220,8 +220,57 @@ def test_generate_text_defaults(mock_dataframe, mock_session): assert "STRUCT()" in query +def test_generate_table_with_dataframe(mock_dataframe, mock_session): + model_name = "project.dataset.model" + + bbq.ai.generate_table( + model_name, + mock_dataframe, + output_schema="col1 STRING, col2 INT64", + ) + + mock_session.read_gbq_query.assert_called_once() + query = mock_session.read_gbq_query.call_args[0][0] + + # Normalize whitespace for comparison + query = " ".join(query.split()) + + expected_part_1 = "SELECT * FROM AI.GENERATE_TABLE(" + expected_part_2 = f"MODEL `{model_name}`," + expected_part_3 = "(SELECT * FROM my_table)," + expected_part_4 = "STRUCT('col1 STRING, col2 INT64' AS output_schema)" + + assert expected_part_1 in query + assert expected_part_2 in query + assert expected_part_3 in query + assert expected_part_4 in query + + +def test_generate_table_with_options(mock_dataframe, mock_session): + model_name = "project.dataset.model" + + bbq.ai.generate_table( + model_name, + mock_dataframe, + output_schema="col1 STRING", + temperature=0.5, + max_output_tokens=100, + ) + + mock_session.read_gbq_query.assert_called_once() + query = mock_session.read_gbq_query.call_args[0][0] + query = " ".join(query.split()) + + assert f"MODEL `{model_name}`" in query + assert "(SELECT * FROM my_table)" in query + assert ( + "STRUCT('col1 STRING' AS output_schema, 0.5 AS temperature, 100 AS max_output_tokens)" + in query + ) + + @mock.patch("bigframes.pandas.read_pandas") -def test_generate_text_with_pandas_dataframe( +def test_generate_table_with_pandas_dataframe( read_pandas_mock, mock_dataframe, mock_session ): # This tests that pandas input path works and calls read_pandas @@ -232,9 +281,10 @@ def test_generate_text_with_pandas_dataframe( pandas_df = 
pd.DataFrame({"content": ["test"]}) - bbq.ai.generate_text( + bbq.ai.generate_table( model_name, pandas_df, + output_schema="col1 STRING", ) read_pandas_mock.assert_called_once() From 1bbbd3d0f9c5191a268d96d710a7c1657904e0d9 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Wed, 11 Feb 2026 19:53:04 +0000 Subject: [PATCH 2/4] test --- tests/system/large/bigquery/test_ai.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/system/large/bigquery/test_ai.py b/tests/system/large/bigquery/test_ai.py index e318a8a720..86cf4d7f00 100644 --- a/tests/system/large/bigquery/test_ai.py +++ b/tests/system/large/bigquery/test_ai.py @@ -94,3 +94,20 @@ def test_generate_text_with_options(text_model): # It basically asserts that the results are still returned. assert len(result) == 2 + + +def test_generate_table(text_model): + df = bpd.DataFrame( + {"prompt": ["Generate a table of 2 programming languages and their creators."]} + ) + + result = ai.generate_table( + text_model, + df, + output_schema="language STRING, creator STRING", + ) + + assert "language" in result.columns + assert "creator" in result.columns + # The model may not always return the exact number of rows requested. 
+ assert len(result) > 0 From 35254874095e422e1eb25171c114b8a815ecf5d8 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Wed, 11 Feb 2026 19:55:24 +0000 Subject: [PATCH 3/4] docs --- bigframes/bigquery/_operations/ai.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 16ca989bf0..7f9c3eb55f 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -604,7 +604,7 @@ def generate_text( @log_adapter.method_logger(custom_base_name="bigquery_ai") def generate_table( model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series], - data: Union[dataframe.DataFrame, pd.DataFrame], + data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series], *, output_schema: str, temperature: Optional[float] = None, @@ -637,9 +637,11 @@ def generate_table( Args: model (bigframes.ml.base.BaseEstimator or str): The model to use for table generation. - data (bigframes.pandas.DataFrame or pandas.DataFrame): - The data to use as input for table generation. It must contain the - columns that the model expects for constructing the prompt. + data (bigframes.pandas.DataFrame or bigframes.pandas.Series): + The data to use as input for table generation. If a Series is + provided, it is treated as the 'prompt' column. If a DataFrame is + provided, it must contain a 'prompt' column, or you must rename the + column that holds the prompt text to 'prompt'. output_schema (str): A string defining the output schema (e.g., "col1 STRING, col2 INT64"). temperature (float, optional): @@ -650,7 +652,7 @@ def generate_table( output. max_output_tokens (int, optional): An INT64 value that sets the maximum number of tokens in the - generated text. + generated table. stop_sequences (List[str], optional): An ARRAY value that contains the stop sequences for the model. 
request_type (str, optional): From a82d395caed717ea2833a7bb07177da379bbe8f7 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Wed, 11 Feb 2026 20:00:06 +0000 Subject: [PATCH 4/4] gemini.md --- GEMINI.md | 42 +++++++++++++++++++++++++++++++--- tests/unit/bigquery/test_ai.py | 5 ++-- 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/GEMINI.md b/GEMINI.md index 0d74e277ad..0d447f17a4 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -2,12 +2,48 @@ ## Testing -We use `pytest` to instrument our tests. +We use `nox` to instrument our tests. -- To test your changes, run unit tests with `pytest`: +- To test your changes, run unit tests with `nox`: ```bash - pytest :: + nox -r -s unit + ``` + +- To run a single unit test: + + ```bash + nox -r -s unit-3.13 -- -k + ``` + +- Ignore this step if you lack access to Google Cloud resources. To run system + tests, you can execute:: + + # Run all system tests + $ nox -r -s system + + # Run a single system test + $ nox -r -s system-3.13 -- -k + +- The codebase must have better coverage than it had previously after each + change. You can test coverage via `nox -s unit system cover` (takes a long + time). Omit `system` if you lack access to cloud resources. + +## Code Style + +- We use the automatic code formatter `black`. You can run it using + the nox session `format`. This will eliminate many lint errors. Run via: + + ```bash + nox -r -s format + ``` + +- PEP8 compliance is required, with exceptions defined in the linter configuration. + If you have ``nox`` installed, you can test that you have not introduced + any non-compliant code via: + + ``` + nox -r -s lint ``` - When writing tests, use the idiomatic "pytest" style. 
diff --git a/tests/unit/bigquery/test_ai.py b/tests/unit/bigquery/test_ai.py index d0e5f76414..796e86f924 100644 --- a/tests/unit/bigquery/test_ai.py +++ b/tests/unit/bigquery/test_ai.py @@ -270,7 +270,7 @@ def test_generate_table_with_options(mock_dataframe, mock_session): @mock.patch("bigframes.pandas.read_pandas") -def test_generate_table_with_pandas_dataframe( +def test_generate_text_with_pandas_dataframe( read_pandas_mock, mock_dataframe, mock_session ): # This tests that pandas input path works and calls read_pandas @@ -281,10 +281,9 @@ def test_generate_table_with_pandas_dataframe( pandas_df = pd.DataFrame({"content": ["test"]}) - bbq.ai.generate_table( + bbq.ai.generate_text( model_name, pandas_df, - output_schema="col1 STRING", ) read_pandas_mock.assert_called_once()