|
19 | 19 | from __future__ import annotations |
20 | 20 |
|
21 | 21 | import json |
22 | | -from typing import Any, Iterable, List, Literal, Mapping, Tuple, Union |
| 22 | +from typing import Any, Dict, Iterable, List, Literal, Mapping, Optional, Tuple, Union |
23 | 23 |
|
24 | 24 | import pandas as pd |
25 | 25 |
|
|
28 | 28 | from bigframes import series, session |
29 | 29 | from bigframes.core import convert |
30 | 30 | from bigframes.core.logging import log_adapter |
| 31 | +import bigframes.core.sql.literals |
31 | 32 | from bigframes.ml import core as ml_core |
32 | 33 | from bigframes.operations import ai_ops, output_schemas |
33 | 34 |
|
@@ -388,6 +389,113 @@ def generate_double( |
388 | 389 | return series_list[0]._apply_nary_op(operator, series_list[1:]) |
389 | 390 |
|
390 | 391 |
|
@log_adapter.method_logger(custom_base_name="bigquery_ai")
def generate_embedding(
    model_name: str,
    data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series],
    *,
    output_dimensionality: Optional[int] = None,
    task_type: Optional[str] = None,
    start_second: Optional[float] = None,
    end_second: Optional[float] = None,
    interval_seconds: Optional[float] = None,
    trial_id: Optional[int] = None,
) -> dataframe.DataFrame:
    """
    Creates embeddings that describe an entity—for example, a piece of text or an image.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> df = bpd.DataFrame({"content": ["apple", "bear", "pear"]})
        >>> bbq.ai.generate_embedding(
        ...     "project.dataset.model_name",
        ...     df
        ... )  # doctest: +SKIP

    Args:
        model_name (str):
            The name of a remote model from Vertex AI, such as the
            multimodalembedding@001 model.
        data (bigframes.pandas.DataFrame or bigframes.pandas.Series):
            The data to generate embeddings for. If a Series is provided, it is
            treated as the 'content' column. If a DataFrame is provided, it
            must contain a 'content' column, or you must rename the column you
            wish to embed to 'content'.
        output_dimensionality (int, optional):
            An INT64 value that specifies the number of dimensions to use when
            generating embeddings. For example, if you specify 256 AS
            output_dimensionality, then the embedding output column contains a
            256-dimensional embedding for each input value. To find the
            supported range of output dimensions, read about the available
            `Google text embedding models <https://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#google-models>`_.
        task_type (str, optional):
            A STRING literal that specifies the intended downstream application to
            help the model produce better quality embeddings. For a list of
            supported task types and how to choose which one to use, see `Choose an
            embeddings task type <https://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/task-types>`_.
        start_second (float, optional):
            The second in the video at which to start the embedding. The default value is 0.
        end_second (float, optional):
            The second in the video at which to end the embedding. The default value is 120.
        interval_seconds (float, optional):
            The interval to use when creating embeddings. The default value is 16.
        trial_id (int, optional):
            An INT64 value that identifies the hyperparameter tuning trial that
            you want the function to evaluate. The function uses the optimal
            trial by default. Only specify this argument if you ran
            hyperparameter tuning when creating the model.

    Returns:
        bigframes.pandas.DataFrame:
            A new DataFrame with the generated embeddings. See the `SQL
            reference for AI.GENERATE_EMBEDDING
            <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-embedding#output>`_
            for details.

    Raises:
        ValueError: If ``data`` is not one of the supported DataFrame/Series types.
    """
    # Local pandas objects are uploaded so the TVF can run server-side.
    if isinstance(data, (pd.DataFrame, pd.Series)):
        data = bpd.read_pandas(data)

    if isinstance(data, series.Series):
        # A bare Series is treated as the required 'content' input column.
        # Copy first so the caller's Series name is not mutated.
        data = data.copy()
        data.name = "content"
        data_df = data.to_frame()
    elif isinstance(data, dataframe.DataFrame):
        data_df = data
    else:
        raise ValueError(f"Unsupported data type: {type(data)}")

    # The compiled SQL of the input data is passed as a subquery to the TVF.
    source_sql = data_df.sql

    struct_fields: Dict[str, bigframes.core.sql.literals.STRUCT_VALUES] = {}
    if output_dimensionality is not None:
        struct_fields["OUTPUT_DIMENSIONALITY"] = output_dimensionality
    if task_type is not None:
        struct_fields["TASK_TYPE"] = task_type
    if start_second is not None:
        struct_fields["START_SECOND"] = start_second
    if end_second is not None:
        struct_fields["END_SECOND"] = end_second
    if interval_seconds is not None:
        struct_fields["INTERVAL_SECONDS"] = interval_seconds
    if trial_id is not None:
        struct_fields["TRIAL_ID"] = trial_id

    # Only emit the settings STRUCT argument when at least one optional
    # argument was supplied; otherwise the TVF is called without it.
    # NOTE: the original draft appended an extra ")" after the struct
    # literal, producing unbalanced parentheses and invalid SQL.
    if struct_fields:
        settings_clause = (
            f",\n        {bigframes.core.sql.literals.struct_literal(struct_fields)}"
        )
    else:
        settings_clause = ""

    # Construct the TVF query.
    query = f"""
    SELECT *
    FROM AI.GENERATE_EMBEDDING(
        MODEL `{model_name}`,
        ({source_sql}){settings_clause}
    )
    """

    return data_df._session.read_gbq(query)
| 498 | + |
391 | 499 | @log_adapter.method_logger(custom_base_name="bigquery_ai") |
392 | 500 | def if_( |
393 | 501 | prompt: PROMPT_TYPE, |
|
0 commit comments