diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index 3d3e2d0a6f..a578910b65 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -83,7 +83,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -92,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -130,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -157,7 +157,51 @@ " runtime = bbq.obj.get_access_url(s, mode=mode)\n", " \n", " # 3. Convert the runtime object to a JSON string\n", - " return bbq.to_json_string(runtime)" + " return bbq.to_json_string(runtime)\n", + "\n", + "def get_metadata(series):\n", + " # Fetch metadata and extract GCS metadata from the details JSON field\n", + " metadata_obj = bbq.obj.fetch_metadata(series)\n", + " return bbq.json_query(metadata_obj.struct.field(\"details\"), \"$.gcs_metadata\")\n", + "\n", + "def get_content_type(series):\n", + " return bbq.json_value(get_metadata(series), \"$.content_type\")\n", + "\n", + "def get_size(series):\n", + " return bbq.json_value(get_metadata(series), \"$.size\").astype(\"Int64\")\n", + "\n", + "def get_updated(series):\n", + " return bpd.to_datetime(bbq.json_value(get_metadata(series), \"$.updated\").astype(\"Int64\"), unit=\"us\", utc=True)\n", + "\n", + "def display_blob(series, n=3):\n", + " import IPython.display as ipy_display\n", + " import pandas as pd\n", + " import requests\n", + " \n", + " # Retrieve access URLs and content types\n", + " runtime_json = bbq.to_json_string(bbq.obj.get_access_url(series, mode=\"R\"))\n", + " read_url = bbq.json_value(runtime_json, \"$.access_urls.read_url\")\n", + " content_type = get_content_type(series)\n", + " \n", + " # Pull to pandas to display\n", + " pdf = bpd.DataFrame({\"read_url\": read_url, \"content_type\": content_type}).head(n).to_pandas()\n", + " \n", + " width = bigframes.options.display.blob_display_width\n", + " height = bigframes.options.display.blob_display_height\n", + " \n", + " for _, row in pdf.iterrows():\n", + " if pd.isna(row[\"read_url\"]):\n", + " ipy_display.display(\"\")\n", + " elif pd.isna(row[\"content_type\"]):\n", + " ipy_display.display(requests.get(row[\"read_url\"]).content)\n", + " elif row[\"content_type\"].casefold().startswith(\"image\"):\n", + " ipy_display.display(ipy_display.Image(url=row[\"read_url\"], width=width, height=height))\n", + " elif row[\"content_type\"].casefold().startswith(\"audio\"):\n", + " ipy_display.display(ipy_display.Audio(requests.get(row[\"read_url\"]).content))\n", + " elif row[\"content_type\"].casefold().startswith(\"video\"):\n", + " ipy_display.display(ipy_display.Video(row[\"read_url\"], width=width, height=height))\n", + " else:\n", + " ipy_display.display(requests.get(row[\"read_url\"]).content)" ] }, { @@ -172,7 +216,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -180,20 +224,7 @@ "id": "fx6YcZJbeYru", "outputId": "d707954a-0dd0-4c50-b7bf-36b140cf76cf" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/global_session.py:113: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " _global_session = bigframes.session.connect(\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" - ] - } - ], + "outputs": [], "source": [ "# Create blob columns from wildcard path.\n", "df_image = bpd.from_glob_path(\n", @@ -209,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -223,10 +254,12 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n" ] }, { @@ -256,23 +289,23 @@ " \n", " \n", " 0\n", - " \n", + " \n", " \n", " \n", " 1\n", - " \n", + " \n", " \n", " \n", " 2\n", - " \n", + " \n", " \n", " \n", " 3\n", - " \n", + " \n", " \n", " \n", " 4\n", - " \n", + " \n", " \n", " \n", "\n", @@ -281,16 +314,16 @@ ], "text/plain": [ " image\n", - "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n", - "1 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n", - "2 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n", - "3 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n", - "4 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5...\n", + "1 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5...\n", + "2 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5...\n", + "3 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5...\n", + "4 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5...\n", "\n", "[5 rows x 1 columns]" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -321,7 +354,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "id": "YYYVn7NDH0Me" }, @@ -330,35 +363,12 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", - "version. Use `json_query` instead.\n", - " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", - "version. Use `json_query` instead.\n", - " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", - "version. Use `json_query` instead.\n", - " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n" ] }, { @@ -392,7 +402,7 @@ " \n", " \n", " 0\n", - " \n", + " \n", " alice\n", " image/png\n", " 1591240\n", @@ -400,7 +410,7 @@ " \n", " \n", " 1\n", - " \n", + " \n", " bob\n", " image/png\n", " 1182951\n", @@ -408,7 +418,7 @@ " \n", " \n", " 2\n", - " \n", + " \n", " bob\n", " image/png\n", " 1520884\n", @@ -416,7 +426,7 @@ " \n", " \n", " 3\n", - " \n", + " \n", " alice\n", " image/png\n", " 1235401\n", @@ -424,7 +434,7 @@ " \n", " \n", " 4\n", - " \n", + " \n", " bob\n", " image/png\n", " 1591923\n", @@ -437,11 +447,11 @@ ], "text/plain": [ " image author content_type \\\n", - "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... alice image/png \n", - "1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", - "2 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", - "3 {'uri': 'gs://cloud-samples-data/bigquery/tuto... alice image/png \n", - "4 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5... alice image/png \n", + "1 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5... bob image/png \n", + "2 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5... bob image/png \n", + "3 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5... alice image/png \n", + "4 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5... bob image/png \n", "\n", " size updated \n", "0 1591240 2025-03-20 17:45:04+00:00 \n", @@ -453,17 +463,18 @@ "[5 rows x 5 columns]" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Combine unstructured data with structured data\n", + "df_image = df_image.head(5)\n", "df_image[\"author\"] = [\"alice\", \"bob\", \"bob\", \"alice\", \"bob\"] # type: ignore\n", - "df_image[\"content_type\"] = df_image[\"image\"].blob.content_type()\n", - "df_image[\"size\"] = df_image[\"image\"].blob.size()\n", - "df_image[\"updated\"] = df_image[\"image\"].blob.updated()\n", + "df_image[\"content_type\"] = get_content_type(df_image[\"image\"])\n", + "df_image[\"size\"] = get_size(df_image[\"image\"])\n", + "df_image[\"updated\"] = get_updated(df_image[\"image\"])\n", "df_image" ] }, @@ -478,7 +489,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -488,31 +499,10 @@ "outputId": "73feb33d-4a05-48fb-96e5-3c48c2a456f3" }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", - "version. Use `json_query` instead.\n", - " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" - ] - }, { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" @@ -524,7 +514,7 @@ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" @@ -536,7 +526,7 @@ ], "source": [ "# filter images and display, you can also display audio and video types\n", - "df_image[df_image[\"author\"] == \"alice\"][\"image\"].blob.display()" + "display_blob(df_image[df_image[\"author\"] == \"alice\"][\"image\"])" ] }, {