diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index b98a5e7337..3d3e2d0a6f 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -1317,161 +1317,119 @@ "id": "iRUi8AjG7cIf" }, "source": [ - "### 5. PDF chunking function" + "### 5. PDF extraction and chunking function\n", + "\n", + "This section demonstrates how to extract text and chunk text from PDF files using custom BigQuery Python UDFs and the `pypdf` library." ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "oDDuYtUm5Yiy" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "7jLpMYaj7nj8", - "outputId": "06d5456f-580f-4693-adff-2605104b056c" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", - " return method(*args, **kwargs)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", - "future version. Use `json_value_array` instead.\n", - " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", - "future version. Use `json_value_array` instead.\n", - " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" - ] - } - ], - "source": [ - "df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")" + "# Construct the canonical connection ID\n", + "FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n", + "\n", + "@bpd.udf(\n", + " input_types=[str],\n", + " output_type=str,\n", + " dataset=DATASET_ID,\n", + " name=\"pdf_extract\",\n", + " bigquery_connection=FULL_CONNECTION_ID,\n", + " packages=[\"pypdf\", \"requests\", \"cryptography\"],\n", + ")\n", + "def pdf_extract(src_obj_ref_rt: str) -> str:\n", + " import io\n", + " import json\n", + " from pypdf import PdfReader\n", + " import requests\n", + " from requests import adapters\n", + " session = requests.Session()\n", + " session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n", + " src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n", + " src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n", + " response = session.get(src_url, timeout=30, stream=True)\n", + " response.raise_for_status()\n", + " pdf_bytes = response.content\n", + " pdf_file = io.BytesIO(pdf_bytes)\n", + " reader = PdfReader(pdf_file, strict=False)\n", + " all_text = \"\"\n", + " for page in reader.pages:\n", + " page_extract_text = page.extract_text()\n", + " if page_extract_text:\n", + " all_text += page_extract_text\n", + " return all_text\n", + "\n", + "@bpd.udf(\n", + " input_types=[str, int, int],\n", + " output_type=list[str],\n", + " dataset=DATASET_ID,\n", + " name=\"pdf_chunk\",\n", + " bigquery_connection=FULL_CONNECTION_ID,\n", + " packages=[\"pypdf\", \"requests\", \"cryptography\"],\n", + ")\n", + "def pdf_chunk(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> list[str]:\n", + " import io\n", + " import json\n", + " from pypdf import PdfReader\n", + " import requests\n", + " from requests import adapters\n", + " session = requests.Session()\n", + " session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n", + " src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n", + " src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n", + " response = session.get(src_url, timeout=30, stream=True)\n", + " response.raise_for_status()\n", + " pdf_bytes = response.content\n", + " pdf_file = io.BytesIO(pdf_bytes)\n", + " reader = PdfReader(pdf_file, strict=False)\n", + " all_text_chunks = []\n", + " curr_chunk = \"\"\n", + " for page in reader.pages:\n", + " page_text = page.extract_text()\n", + " if page_text:\n", + " curr_chunk += page_text\n", + " while len(curr_chunk) >= chunk_size:\n", + " split_idx = curr_chunk.rfind(\" \", 0, chunk_size)\n", + " if split_idx == -1:\n", + " split_idx = chunk_size\n", + " actual_chunk = curr_chunk[:split_idx]\n", + " all_text_chunks.append(actual_chunk)\n", + " overlap = curr_chunk[split_idx + 1 : split_idx + 1 + overlap_size]\n", + " curr_chunk = overlap + curr_chunk[split_idx + 1 + overlap_size :]\n", + " if curr_chunk:\n", + " all_text_chunks.append(curr_chunk)\n", + " return all_text_chunks" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", - " return method(*args, **kwargs)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", - "future version. Use `json_value_array` instead.\n", - " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
chunked_verbose
0{'status': '', 'content': array([\"CritterCuisi...
\n", - "

1 rows × 1 columns

\n", - "
[1 rows x 1 columns in total]" - ], - "text/plain": [ - " chunked_verbose\n", - "0 {'status': '', 'content': array([\"CritterCuisi...\n", - "\n", - "[1 rows x 1 columns]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "df_pdf[\"chunked_verbose\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\", verbose=True)\n", - "df_pdf[[\"chunked_verbose\"]]" + "df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")\n", + "\n", + "# Generate a JSON string containing the runtime information (including signed read URLs)\n", + "access_urls = get_runtime_json_str(df_pdf[\"pdf\"], mode=\"R\")\n", + "\n", + "# Apply PDF extraction\n", + "df_pdf[\"extracted_text\"] = access_urls.apply(pdf_extract)\n", + "\n", + "# Apply PDF chunking\n", + "df_pdf[\"chunked\"] = access_urls.apply(pdf_chunk, args=(2000, 200))\n", + "\n", + "df_pdf[[\"extracted_text\", \"chunked\"]]" ] }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "id": "kaPvJATN7zlw" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" - ] - }, - { - "data": { - "text/plain": [ - "0 CritterCuisine Pro 5000 - Automatic Pet Feeder...\n", - "0 on a level, stable surface to prevent tipping....\n", - "0 included)\\nto maintain the schedule during pow...\n", - "0 digits for Meal 1 will flash.\\n\u0000. Use the UP/D...\n", - "0 paperclip) for 5\\nseconds. This will reset all...\n", - "0 unit with a damp cloth. Do not immerse the bas...\n", - "0 continues,\\ncontact customer support.\\nE2: Foo...\n", - "Name: chunked, dtype: string" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ + "# Explode the chunks to see each chunk as a separate row\n", "chunked = df_pdf[\"chunked\"].explode()\n", "chunked" ] @@ -1674,7 +1632,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.0" + "version": "3.10.15" } }, "nbformat": 4,