diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index b98a5e7337..3d3e2d0a6f 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -1317,161 +1317,119 @@ "id": "iRUi8AjG7cIf" }, "source": [ - "### 5. PDF chunking function" + "### 5. PDF extraction and chunking function\n", + "\n", + "This section demonstrates how to extract text and chunk text from PDF files using custom BigQuery Python UDFs and the `pypdf` library." ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "oDDuYtUm5Yiy" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "7jLpMYaj7nj8", - "outputId": "06d5456f-580f-4693-adff-2605104b056c" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", - " return method(*args, **kwargs)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", - "future version. Use `json_value_array` instead.\n", - " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", - "future version. Use `json_value_array` instead.\n", - " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" - ] - } - ], - "source": [ - "df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")" + "# Construct the canonical connection ID\n", + "FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n", + "\n", + "@bpd.udf(\n", + " input_types=[str],\n", + " output_type=str,\n", + " dataset=DATASET_ID,\n", + " name=\"pdf_extract\",\n", + " bigquery_connection=FULL_CONNECTION_ID,\n", + " packages=[\"pypdf\", \"requests\", \"cryptography\"],\n", + ")\n", + "def pdf_extract(src_obj_ref_rt: str) -> str:\n", + " import io\n", + " import json\n", + " from pypdf import PdfReader\n", + " import requests\n", + " from requests import adapters\n", + " session = requests.Session()\n", + " session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n", + " src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n", + " src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n", + " response = session.get(src_url, timeout=30, stream=True)\n", + " response.raise_for_status()\n", + " pdf_bytes = response.content\n", + " pdf_file = io.BytesIO(pdf_bytes)\n", + " reader = PdfReader(pdf_file, strict=False)\n", + " all_text = \"\"\n", + " for page in reader.pages:\n", + " page_extract_text = page.extract_text()\n", + " if page_extract_text:\n", + " all_text += page_extract_text\n", + " return all_text\n", + "\n", + "@bpd.udf(\n", + " input_types=[str, int, int],\n", + " output_type=list[str],\n", + " dataset=DATASET_ID,\n", + " name=\"pdf_chunk\",\n", + " bigquery_connection=FULL_CONNECTION_ID,\n", + " packages=[\"pypdf\", \"requests\", \"cryptography\"],\n", + ")\n", + "def pdf_chunk(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> list[str]:\n", + " import io\n", + " import json\n", + " from pypdf import PdfReader\n", + " import requests\n", + " from requests import adapters\n", + " session = requests.Session()\n", + " session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n", + " src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n", + " src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n", + " response = session.get(src_url, timeout=30, stream=True)\n", + " response.raise_for_status()\n", + " pdf_bytes = response.content\n", + " pdf_file = io.BytesIO(pdf_bytes)\n", + " reader = PdfReader(pdf_file, strict=False)\n", + " all_text_chunks = []\n", + " curr_chunk = \"\"\n", + " for page in reader.pages:\n", + " page_text = page.extract_text()\n", + " if page_text:\n", + " curr_chunk += page_text\n", + " while len(curr_chunk) >= chunk_size:\n", + " split_idx = curr_chunk.rfind(\" \", 0, chunk_size)\n", + " if split_idx == -1:\n", + " split_idx = chunk_size\n", + " actual_chunk = curr_chunk[:split_idx]\n", + " all_text_chunks.append(actual_chunk)\n", + " overlap = curr_chunk[split_idx + 1 : split_idx + 1 + overlap_size]\n", + " curr_chunk = overlap + curr_chunk[split_idx + 1 + overlap_size :]\n", + " if curr_chunk:\n", + " all_text_chunks.append(curr_chunk)\n", + " return all_text_chunks" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", - " return method(*args, **kwargs)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", - "future version. Use `json_value_array` instead.\n", - " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
| \n", - " | chunked_verbose | \n", - "
|---|---|
| 0 | \n", - "{'status': '', 'content': array([\"CritterCuisi... | \n", - "
1 rows × 1 columns
\n", - "