Skip to content

Commit ff25f8d

Browse files
Merge remote-tracking branch 'github/main' into iceberg_tables
2 parents 98a18e4 + e91536c commit ff25f8d

File tree

36 files changed

+1183
-307
lines changed

36 files changed

+1183
-307
lines changed

.librarian/state.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:e7cc6823efb073a8a26e7cefdd869f12ec228abfbd2a44aa9a7eacc284023677
22
libraries:
33
- id: bigframes
4-
version: 2.33.0
4+
version: 2.34.0
55
last_generated_commit: ""
66
apis: []
77
source_roots:

CHANGELOG.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,22 @@
44

55
[1]: https://pypi.org/project/bigframes/#history
66

7+
## [2.34.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.33.0...v2.34.0) (2026-02-02)
8+
9+
10+
### Features
11+
12+
* add `bigframes.pandas.options.experiments.sql_compiler` for switching the backend compiler (#2417) ([7eba6ee03f07938315d99e2aeaf72368c02074cf](https://github.com/googleapis/python-bigquery-dataframes/commit/7eba6ee03f07938315d99e2aeaf72368c02074cf))
13+
* add bigquery.ml.generate_embedding function (#2422) ([35f3f5e6f8c64b47e6e7214034f96f047785e647](https://github.com/googleapis/python-bigquery-dataframes/commit/35f3f5e6f8c64b47e6e7214034f96f047785e647))
14+
* add bigquery.create_external_table method (#2415) ([76db2956e505aec4f1055118ac7ca523facc10ff](https://github.com/googleapis/python-bigquery-dataframes/commit/76db2956e505aec4f1055118ac7ca523facc10ff))
15+
* add deprecation warnings for .blob accessor and read_gbq_object_table (#2408) ([7261a4ea5cdab6b30f5bc333501648c60e70be59](https://github.com/googleapis/python-bigquery-dataframes/commit/7261a4ea5cdab6b30f5bc333501648c60e70be59))
16+
* add bigquery.ml.generate_text function (#2403) ([5ac681028624de15e31f0c2ae360b47b2dcf1e8d](https://github.com/googleapis/python-bigquery-dataframes/commit/5ac681028624de15e31f0c2ae360b47b2dcf1e8d))
17+
18+
19+
### Bug Fixes
20+
21+
* broken job url (#2411) ([fcb5bc1761c656e1aec61dbcf96a36d436833b7a](https://github.com/googleapis/python-bigquery-dataframes/commit/fcb5bc1761c656e1aec61dbcf96a36d436833b7a))
22+
723
## [2.33.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.32.0...v2.33.0) (2026-01-22)
824

925

bigframes/_config/experiment_options.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
from typing import Optional
15+
from typing import Literal, Optional
1616
import warnings
1717

1818
import bigframes
@@ -27,6 +27,7 @@ class ExperimentOptions:
2727
def __init__(self):
2828
self._semantic_operators: bool = False
2929
self._ai_operators: bool = False
30+
self._sql_compiler: Literal["legacy", "stable", "experimental"] = "stable"
3031

3132
@property
3233
def semantic_operators(self) -> bool:
@@ -55,6 +56,24 @@ def ai_operators(self, value: bool):
5556
warnings.warn(msg, category=bfe.PreviewWarning)
5657
self._ai_operators = value
5758

59+
@property
60+
def sql_compiler(self) -> Literal["legacy", "stable", "experimental"]:
61+
return self._sql_compiler
62+
63+
@sql_compiler.setter
64+
def sql_compiler(self, value: Literal["legacy", "stable", "experimental"]):
65+
if value not in ["legacy", "stable", "experimental"]:
66+
raise ValueError(
67+
"sql_compiler must be one of 'legacy', 'stable', or 'experimental'"
68+
)
69+
if value == "experimental":
70+
msg = bfe.format_message(
71+
"The experimental SQL compiler is still under experiments, and is subject "
72+
"to change in the future."
73+
)
74+
warnings.warn(msg, category=FutureWarning)
75+
self._sql_compiler = value
76+
5877
@property
5978
def blob(self) -> bool:
6079
msg = bfe.format_message(

bigframes/bigquery/__init__.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
st_regionstats,
4444
st_simplify,
4545
)
46+
from bigframes.bigquery._operations.io import load_data
4647
from bigframes.bigquery._operations.json import (
4748
json_extract,
4849
json_extract_array,
@@ -60,7 +61,7 @@
6061
from bigframes.bigquery._operations.search import create_vector_index, vector_search
6162
from bigframes.bigquery._operations.sql import sql_scalar
6263
from bigframes.bigquery._operations.struct import struct
63-
from bigframes.bigquery.table import create_external_table
64+
from bigframes.bigquery._operations.table import create_external_table
6465
from bigframes.core.logging import log_adapter
6566

6667
_functions = [
@@ -107,6 +108,8 @@
107108
struct,
108109
# table ops
109110
create_external_table,
111+
# io ops
112+
load_data,
110113
]
111114

112115
_module = sys.modules[__name__]
@@ -160,6 +163,8 @@
160163
"struct",
161164
# table ops
162165
"create_external_table",
166+
# io ops
167+
"load_data",
163168
# Modules / SQL namespaces
164169
"ai",
165170
"ml",

bigframes/bigquery/_operations/ai.py

Lines changed: 109 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from __future__ import annotations
2020

2121
import json
22-
from typing import Any, Iterable, List, Literal, Mapping, Tuple, Union
22+
from typing import Any, Dict, Iterable, List, Literal, Mapping, Optional, Tuple, Union
2323

2424
import pandas as pd
2525

@@ -28,6 +28,7 @@
2828
from bigframes import series, session
2929
from bigframes.core import convert
3030
from bigframes.core.logging import log_adapter
31+
import bigframes.core.sql.literals
3132
from bigframes.ml import core as ml_core
3233
from bigframes.operations import ai_ops, output_schemas
3334

@@ -388,6 +389,113 @@ def generate_double(
388389
return series_list[0]._apply_nary_op(operator, series_list[1:])
389390

390391

392+
@log_adapter.method_logger(custom_base_name="bigquery_ai")
393+
def generate_embedding(
394+
model_name: str,
395+
data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series],
396+
*,
397+
output_dimensionality: Optional[int] = None,
398+
task_type: Optional[str] = None,
399+
start_second: Optional[float] = None,
400+
end_second: Optional[float] = None,
401+
interval_seconds: Optional[float] = None,
402+
trial_id: Optional[int] = None,
403+
) -> dataframe.DataFrame:
404+
"""
405+
Creates embeddings that describe an entity—for example, a piece of text or an image.
406+
407+
**Examples:**
408+
409+
>>> import bigframes.pandas as bpd
410+
>>> import bigframes.bigquery as bbq
411+
>>> df = bpd.DataFrame({"content": ["apple", "bear", "pear"]})
412+
>>> bbq.ai.generate_embedding(
413+
... "project.dataset.model_name",
414+
... df
415+
... ) # doctest: +SKIP
416+
417+
Args:
418+
model_name (str):
419+
The name of a remote model from Vertex AI, such as the
420+
multimodalembedding@001 model.
421+
data (bigframes.pandas.DataFrame or bigframes.pandas.Series):
422+
The data to generate embeddings for. If a Series is provided, it is
423+
treated as the 'content' column. If a DataFrame is provided, it
424+
must contain a 'content' column, or you must rename the column you
425+
wish to embed to 'content'.
426+
output_dimensionality (int, optional):
427+
An INT64 value that specifies the number of dimensions to use when
428+
generating embeddings. For example, if you specify 256 AS
429+
output_dimensionality, then the embedding output column contains a
430+
256-dimensional embedding for each input value. To find the
431+
supported range of output dimensions, read about the available
432+
`Google text embedding models <https://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#google-models>`_.
433+
task_type (str, optional):
434+
A STRING literal that specifies the intended downstream application to
435+
help the model produce better quality embeddings. For a list of
436+
supported task types and how to choose which one to use, see `Choose an
437+
embeddings task type <http://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/task-types>`_.
438+
start_second (float, optional):
439+
The second in the video at which to start the embedding. The default value is 0.
440+
end_second (float, optional):
441+
The second in the video at which to end the embedding. The default value is 120.
442+
interval_seconds (float, optional):
443+
The interval to use when creating embeddings. The default value is 16.
444+
trial_id (int, optional):
445+
An INT64 value that identifies the hyperparameter tuning trial that
446+
you want the function to evaluate. The function uses the optimal
447+
trial by default. Only specify this argument if you ran
448+
hyperparameter tuning when creating the model.
449+
450+
Returns:
451+
bigframes.pandas.DataFrame:
452+
A new DataFrame with the generated embeddings. See the `SQL
453+
reference for AI.GENERATE_EMBEDDING
454+
<https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-embedding#output>`_
455+
for details.
456+
"""
457+
if isinstance(data, (pd.DataFrame, pd.Series)):
458+
data = bpd.read_pandas(data)
459+
460+
if isinstance(data, series.Series):
461+
data = data.copy()
462+
data.name = "content"
463+
data_df = data.to_frame()
464+
elif isinstance(data, dataframe.DataFrame):
465+
data_df = data
466+
else:
467+
raise ValueError(f"Unsupported data type: {type(data)}")
468+
469+
# We need to get the SQL for the input data to pass as a subquery to the TVF
470+
source_sql = data_df.sql
471+
472+
struct_fields: Dict[str, bigframes.core.sql.literals.STRUCT_VALUES] = {}
473+
if output_dimensionality is not None:
474+
struct_fields["OUTPUT_DIMENSIONALITY"] = output_dimensionality
475+
if task_type is not None:
476+
struct_fields["TASK_TYPE"] = task_type
477+
if start_second is not None:
478+
struct_fields["START_SECOND"] = start_second
479+
if end_second is not None:
480+
struct_fields["END_SECOND"] = end_second
481+
if interval_seconds is not None:
482+
struct_fields["INTERVAL_SECONDS"] = interval_seconds
483+
if trial_id is not None:
484+
struct_fields["TRIAL_ID"] = trial_id
485+
486+
# Construct the TVF query
487+
query = f"""
488+
SELECT *
489+
FROM AI.GENERATE_EMBEDDING(
490+
MODEL `{model_name}`,
491+
({source_sql}),
492+
{bigframes.core.sql.literals.struct_literal(struct_fields)})
493+
)
494+
"""
495+
496+
return data_df._session.read_gbq(query)
497+
498+
391499
@log_adapter.method_logger(custom_base_name="bigquery_ai")
392500
def if_(
393501
prompt: PROMPT_TYPE,
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
from typing import Mapping, Optional, Union
18+
19+
import pandas as pd
20+
21+
from bigframes.bigquery._operations.table import _get_table_metadata
22+
import bigframes.core.logging.log_adapter as log_adapter
23+
import bigframes.core.sql.io
24+
import bigframes.session
25+
26+
27+
@log_adapter.method_logger(custom_base_name="bigquery_io")
28+
def load_data(
29+
table_name: str,
30+
*,
31+
write_disposition: str = "INTO",
32+
columns: Optional[Mapping[str, str]] = None,
33+
partition_by: Optional[list[str]] = None,
34+
cluster_by: Optional[list[str]] = None,
35+
table_options: Optional[Mapping[str, Union[str, int, float, bool, list]]] = None,
36+
from_files_options: Mapping[str, Union[str, int, float, bool, list]],
37+
with_partition_columns: Optional[Mapping[str, str]] = None,
38+
connection_name: Optional[str] = None,
39+
session: Optional[bigframes.session.Session] = None,
40+
) -> pd.Series:
41+
"""
42+
Loads data into a BigQuery table.
43+
See the `BigQuery LOAD DATA DDL syntax
44+
<https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/load-statements>`_
45+
for additional reference.
46+
Args:
47+
table_name (str):
48+
The name of the table in BigQuery.
49+
write_disposition (str, default "INTO"):
50+
Whether to replace the table if it already exists ("OVERWRITE") or append to it ("INTO").
51+
columns (Mapping[str, str], optional):
52+
The table's schema.
53+
partition_by (list[str], optional):
54+
A list of partition expressions to partition the table by. See https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/load-statements#partition_expression.
55+
cluster_by (list[str], optional):
56+
A list of columns to cluster the table by.
57+
table_options (Mapping[str, Union[str, int, float, bool, list]], optional):
58+
The table options.
59+
from_files_options (Mapping[str, Union[str, int, float, bool, list]]):
60+
The options for loading data from files.
61+
with_partition_columns (Mapping[str, str], optional):
62+
The table's partition columns.
63+
connection_name (str, optional):
64+
The connection to use for the table.
65+
session (bigframes.session.Session, optional):
66+
The session to use. If not provided, the default session is used.
67+
Returns:
68+
pandas.Series:
69+
A Series with object dtype containing the table metadata. Reference
70+
the `BigQuery Table REST API reference
71+
<https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table>`_
72+
for available fields.
73+
"""
74+
import bigframes.pandas as bpd
75+
76+
sql = bigframes.core.sql.io.load_data_ddl(
77+
table_name=table_name,
78+
write_disposition=write_disposition,
79+
columns=columns,
80+
partition_by=partition_by,
81+
cluster_by=cluster_by,
82+
table_options=table_options,
83+
from_files_options=from_files_options,
84+
with_partition_columns=with_partition_columns,
85+
connection_name=connection_name,
86+
)
87+
88+
if session is None:
89+
bpd.read_gbq_query(sql)
90+
session = bpd.get_global_session()
91+
else:
92+
session.read_gbq_query(sql)
93+
94+
return _get_table_metadata(bqclient=session.bqclient, table_name=table_name)

bigframes/bigquery/_operations/ml.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,3 +520,63 @@ def generate_text(
520520
return bpd.read_gbq_query(sql)
521521
else:
522522
return session.read_gbq_query(sql)
523+
524+
525+
@log_adapter.method_logger(custom_base_name="bigquery_ml")
526+
def generate_embedding(
527+
model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series],
528+
input_: Union[pd.DataFrame, dataframe.DataFrame, str],
529+
*,
530+
flatten_json_output: Optional[bool] = None,
531+
task_type: Optional[str] = None,
532+
output_dimensionality: Optional[int] = None,
533+
) -> dataframe.DataFrame:
534+
"""
535+
Generates text embedding using a BigQuery ML model.
536+
537+
See the `BigQuery ML GENERATE_EMBEDDING function syntax
538+
<https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-generate-embedding>`_
539+
for additional reference.
540+
541+
Args:
542+
model (bigframes.ml.base.BaseEstimator or str):
543+
The model to use for text embedding.
544+
input_ (Union[bigframes.pandas.DataFrame, str]):
545+
The DataFrame or query to use for text embedding.
546+
flatten_json_output (bool, optional):
547+
A BOOL value that determines the content of the generated JSON column.
548+
task_type (str, optional):
549+
A STRING value that specifies the intended downstream application task.
550+
Supported values are:
551+
- `RETRIEVAL_QUERY`
552+
- `RETRIEVAL_DOCUMENT`
553+
- `SEMANTIC_SIMILARITY`
554+
- `CLASSIFICATION`
555+
- `CLUSTERING`
556+
- `QUESTION_ANSWERING`
557+
- `FACT_VERIFICATION`
558+
- `CODE_RETRIEVAL_QUERY`
559+
output_dimensionality (int, optional):
560+
An INT64 value that specifies the size of the output embedding.
561+
562+
Returns:
563+
bigframes.pandas.DataFrame:
564+
The generated text embedding.
565+
"""
566+
import bigframes.pandas as bpd
567+
568+
model_name, session = _get_model_name_and_session(model, input_)
569+
table_sql = _to_sql(input_)
570+
571+
sql = bigframes.core.sql.ml.generate_embedding(
572+
model_name=model_name,
573+
table=table_sql,
574+
flatten_json_output=flatten_json_output,
575+
task_type=task_type,
576+
output_dimensionality=output_dimensionality,
577+
)
578+
579+
if session is None:
580+
return bpd.read_gbq_query(sql)
581+
else:
582+
return session.read_gbq_query(sql)

0 commit comments

Comments
 (0)