Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
236 changes: 97 additions & 139 deletions notebooks/multimodal/multimodal_dataframe.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1317,161 +1317,119 @@
"id": "iRUi8AjG7cIf"
},
"source": [
"### 5. PDF chunking function"
"### 5. PDF extraction and chunking function\n",
"\n",
"This section demonstrates how to extract and chunk text from PDF files using custom BigQuery Python UDFs and the `pypdf` library."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "oDDuYtUm5Yiy"
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7jLpMYaj7nj8",
"outputId": "06d5456f-580f-4693-adff-2605104b056c"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
"instead of using `db_dtypes` in the future when available in pandas\n",
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n",
" return method(*args, **kwargs)\n",
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
"future version. Use `json_value_array` instead.\n",
" warnings.warn(bfe.format_message(msg), category=UserWarning)\n",
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
"future version. Use `json_value_array` instead.\n",
" warnings.warn(bfe.format_message(msg), category=UserWarning)\n"
]
}
],
"source": [
"df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")"
# Canonical connection ID used by the managed UDFs defined below.
FULL_CONNECTION_ID = f"{PROJECT}.{LOCATION}.bigframes-default-connection"

@bpd.udf(
    input_types=[str],
    output_type=str,
    dataset=DATASET_ID,
    name="pdf_extract",
    bigquery_connection=FULL_CONNECTION_ID,
    packages=["pypdf", "requests", "cryptography"],
)
def pdf_extract(src_obj_ref_rt: str) -> str:
    """Download a PDF through its signed read URL and return all page text.

    Args:
        src_obj_ref_rt: JSON-encoded ObjectRef runtime string; its
            ``access_urls.read_url`` field carries a signed HTTPS URL
            for the source PDF.

    Returns:
        The text of every page concatenated in order (pages that yield
        no text are skipped; an empty string if nothing is extractable).
    """
    import io
    import json

    import requests
    from pypdf import PdfReader
    from requests import adapters

    # Retry transient HTTPS failures up to 3 times.
    http = requests.Session()
    http.mount("https://", adapters.HTTPAdapter(max_retries=3))

    runtime_info = json.loads(src_obj_ref_rt)
    read_url = runtime_info["access_urls"]["read_url"]

    resp = http.get(read_url, timeout=30, stream=True)
    resp.raise_for_status()

    # strict=False tolerates minor structural defects common in real PDFs.
    reader = PdfReader(io.BytesIO(resp.content), strict=False)
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(text for text in page_texts if text)
"\n",
@bpd.udf(
    input_types=[str, int, int],
    output_type=list[str],
    dataset=DATASET_ID,
    name="pdf_chunk",
    bigquery_connection=FULL_CONNECTION_ID,
    packages=["pypdf", "requests", "cryptography"],
)
def pdf_chunk(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> list[str]:
    """Download a PDF and split its text into word-aligned, overlapping chunks.

    Args:
        src_obj_ref_rt: JSON-encoded ObjectRef runtime string; its
            ``access_urls.read_url`` field carries a signed HTTPS URL
            for the source PDF.
        chunk_size: Target maximum chunk length in characters (> 0).
        overlap_size: Number of trailing characters of each emitted chunk
            repeated at the start of the next one; must satisfy
            0 <= overlap_size < chunk_size.

    Returns:
        List of text chunks in document order (empty list for a text-free PDF).

    Raises:
        ValueError: If chunk_size or overlap_size is out of range.
    """
    import io
    import json

    import requests
    from pypdf import PdfReader
    from requests import adapters

    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    # overlap >= chunk_size would make the loop below non-terminating.
    if not 0 <= overlap_size < chunk_size:
        raise ValueError("overlap_size must be in [0, chunk_size)")

    session = requests.Session()
    session.mount("https://", adapters.HTTPAdapter(max_retries=3))
    src_obj_ref_rt_json = json.loads(src_obj_ref_rt)
    src_url = src_obj_ref_rt_json["access_urls"]["read_url"]
    response = session.get(src_url, timeout=30, stream=True)
    response.raise_for_status()
    reader = PdfReader(io.BytesIO(response.content), strict=False)

    all_text_chunks = []
    curr_chunk = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if not page_text:
            continue
        curr_chunk += page_text
        while len(curr_chunk) >= chunk_size:
            # Prefer splitting at the last space inside the window so chunks
            # end on word boundaries; otherwise hard-cut at chunk_size.
            split_idx = curr_chunk.rfind(" ", 0, chunk_size)
            if split_idx == -1:
                split_idx = chunk_size
                consumed = split_idx  # hard cut: keep the char at split_idx
            else:
                consumed = split_idx + 1  # drop the separating space only
            actual_chunk = curr_chunk[:split_idx]
            all_text_chunks.append(actual_chunk)
            # Fix: overlap must be the tail of the chunk just emitted.
            # (Previously it was taken from the remaining text, which made
            # `overlap_size` a no-op, and the hard-cut path dropped a char.)
            overlap = actual_chunk[-overlap_size:] if overlap_size else ""
            curr_chunk = overlap + curr_chunk[consumed:]
    if curr_chunk:
        all_text_chunks.append(curr_chunk)
    return all_text_chunks
]
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
"instead of using `db_dtypes` in the future when available in pandas\n",
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n",
" return method(*args, **kwargs)\n",
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
"future version. Use `json_value_array` instead.\n",
" warnings.warn(bfe.format_message(msg), category=UserWarning)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>chunked_verbose</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>{'status': '', 'content': array([\"CritterCuisi...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1 rows × 1 columns</p>\n",
"</div>[1 rows x 1 columns in total]"
],
"text/plain": [
" chunked_verbose\n",
"0 {'status': '', 'content': array([\"CritterCuisi...\n",
"\n",
"[1 rows x 1 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"df_pdf[\"chunked_verbose\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\", verbose=True)\n",
"df_pdf[[\"chunked_verbose\"]]"
# Build a DataFrame of blob references over the sample PDF documents.
df_pdf = bpd.from_glob_path(
    "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*", name="pdf"
)

# JSON runtime strings carrying signed read URLs for each blob
# (mode="R" requests read access).
access_urls = get_runtime_json_str(df_pdf["pdf"], mode="R")

# Full-document text extraction via the pdf_extract UDF.
df_pdf["extracted_text"] = access_urls.apply(pdf_extract)

# 2000-character chunks with 200 characters of overlap via pdf_chunk.
df_pdf["chunked"] = access_urls.apply(pdf_chunk, args=(2000, 200))

df_pdf[["extracted_text", "chunked"]]
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"id": "kaPvJATN7zlw"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
"instead of using `db_dtypes` in the future when available in pandas\n",
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n"
]
},
{
"data": {
"text/plain": [
"0 CritterCuisine Pro 5000 - Automatic Pet Feeder...\n",
"0 on a level, stable surface to prevent tipping....\n",
"0 included)\\nto maintain the schedule during pow...\n",
"0 digits for Meal 1 will flash.\\n\u0000. Use the UP/D...\n",
"0 paperclip) for 5\\nseconds. This will reset all...\n",
"0 unit with a damp cloth. Do not immerse the bas...\n",
"0 continues,\\ncontact customer support.\\nE2: Foo...\n",
"Name: chunked, dtype: string"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Explode the chunks to see each chunk as a separate row\n",
"chunked = df_pdf[\"chunked\"].explode()\n",
"chunked"
]
Expand Down Expand Up @@ -1674,7 +1632,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
"version": "3.10.15"
}
},
"nbformat": 4,
Expand Down
Loading