From 7ce3de1eee7f4abdbb969c0c11c27eb07b30f221 Mon Sep 17 00:00:00 2001 From: andrespalate12-eng Date: Sun, 24 Aug 2025 21:26:24 -0500 Subject: [PATCH 1/2] Main primera parte actualizada --- lessons/02_web_scraping.ipynb | 461 +++++++++++++++++++++++++++++----- 1 file changed, 401 insertions(+), 60 deletions(-) diff --git a/lessons/02_web_scraping.ipynb b/lessons/02_web_scraping.ipynb index 385806a..09599f8 100644 --- a/lessons/02_web_scraping.ipynb +++ b/lessons/02_web_scraping.ipynb @@ -45,20 +45,68 @@ "We will use two main packages: [Requests](http://docs.python-requests.org/en/latest/user/quickstart/) and [Beautiful Soup](http://www.crummy.com/software/BeautifulSoup/bs4/doc/). Go ahead and install these packages, if you haven't already:" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "La siguiente sección corresponde a la instalación de los paquetes necesarios: requests, beautifulsoup4 y lxml.\n", + "## Funcion del request\n", + "\n", + "* Permite realizar peticiones HTTP de manera sencilla y manejar las respuestas devueltas por un servidor web.\n", + "* Facilita operaciones como enviar parámetros, encabezados y autenticación en las solicitudes.\n", + "\n", + "## Funcion del beautifulsoup4\n", + "\n", + "* Se utiliza para extraer y procesar información de páginas web obtenidas mediante una solicitud HTTP.\n", + "* Permite navegar y buscar de forma intuitiva entre etiquetas, atributos y textos dentro del código HTML.\n", + "\n", + "## Funcion del lxml\n", + "\n", + "* Es una librería especializada en el procesamiento y manipulación de documentos XML y HTML.\n", + "* Resulta muy eficiente en tareas que requieren analizar, transformar o extraer datos estructurados en forma de árbol.\n", + "\n", + "\n" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: requests 
in c:\\users\\user\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (2.31.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\user\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from requests) (3.4.3)\n", + "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\user\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from requests) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\user\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from requests) (2.5.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\user\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from requests) (2025.8.3)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%pip install requests" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: beautifulsoup4 in c:\\users\\user\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (4.13.4)\n", + "Requirement already satisfied: soupsieve>1.2 in c:\\users\\user\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from beautifulsoup4) (2.7)\n", + "Requirement already satisfied: typing-extensions>=4.0.0 in c:\\users\\user\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from beautifulsoup4) (4.14.1)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%pip install beautifulsoup4" ] @@ -72,16 +120,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: lxml in 
c:\\users\\user\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (6.0.1)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%pip install lxml" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Importación de librerías\n", + "\n", + "En esta sección se importan las librerías necesarias que serán utilizadas en el script para realizar web scraping, manipulación de fechas y control de tiempo en la ejecución.\n", + "\n", + "- **from bs4 import BeautifulSoup** → importa BeautifulSoup, que se usará para analizar y extraer información de documentos HTML. \n", + "- **from datetime import datetime** → permite trabajar con fechas y horas, como obtener la fecha actual o formatear timestamps. \n", + "- **import requests** → se utiliza para realizar peticiones HTTP y obtener el contenido de páginas web. \n", + "- **import time** → proporciona funciones para controlar pausas en la ejecución del script, por ejemplo usando `time.sleep()`.\n", + "\n" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "tags": [] }, @@ -124,12 +196,33 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": { "tags": [] }, - "outputs": [], + "source": [ + "Este bloque realiza un request HTTP mediante el metodo GET a un sitio web y muestra parte del contenido recibido. 
\n", + "\n", + "- **req = requests.get(http://www.ilga.gov/senate/default.asp)** → Realiza una solicitud HTTP GET a la URL especificada\n", + "\n", + "- **src = req.text** → Obtiene el contenido de la respuesta del servidor en formato de texto (HTML) \n", + "\n", + "- **print(src[:1000])** → Muestra los primeros 1000 caracteres del contenido para ver una vista previa" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "# Make a GET request\n", "req = requests.get('http://www.ilga.gov/senate/default.asp')\n", @@ -150,11 +243,27 @@ "If you run into an error about a parser library, make sure you've installed the `lxml` package to provide Beautiful Soup with the necessary parsing tools." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicacion del bloque de codigo ##\n", + "Este bloque convierte la respuesta obtenida del servidor en un árbol HTML utilizando *BeautifulSoup*, lo que facilita navegar y extraer información de la página de los primeros 1000 caracteres.\n" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "# Parse the response into an HTML tree\n", "soup = BeautifulSoup(src, 'lxml')\n", @@ -188,11 +297,28 @@ "What does the example below do?" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "\n", + "Busca todos los elementos con la etiqueta `` dentro del árbol HTML y muestra únicamente los primeros 10 resultados encontrados." 
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n" + ] + } + ], "source": [ "# Find all elements with a certain tag\n", "a_tags = soup.find_all(\"a\")\n", @@ -208,13 +334,33 @@ "These two lines of code are equivalent:" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "Se buscan todos los elementos con la etiqueta `` usando dos formas equivalentes (`soup.find_all(\"a\")` y `soup(\"a\")`) y se imprime el primer elemento obtenido en cada caso. Por ultimo, imprime la cantidad de elementos del primer caso." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mIndexError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m a_tags = soup.find_all(\u001b[33m\"\u001b[39m\u001b[33ma\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 2\u001b[39m a_tags_alt = soup(\u001b[33m\"\u001b[39m\u001b[33ma\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43ma_tags\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m)\n\u001b[32m 4\u001b[39m \u001b[38;5;28mprint\u001b[39m(a_tags_alt[\u001b[32m0\u001b[39m])\n", + "\u001b[31mIndexError\u001b[39m: list index out of range" + ] + } + ], "source": [ "a_tags = soup.find_all(\"a\")\n", "a_tags_alt = soup(\"a\")\n", @@ -231,9 +377,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 
12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + } + ], "source": [ "print(len(a_tags))" ] @@ -249,16 +403,43 @@ "We can do this by adding an additional argument to the `find_all`. In the example below, we are finding all the `a` tags, and then filtering those with `class_=\"sidemenu\"`." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "Se buscan específicamente las etiquetas `` que pertenecen a la clase `sidemenu` dentro del HTML. Primero se usa el codigo `soup(\"a\", class_=\"sidemenu\")` y luego la sintaxis de selectores CSS con `soup.select(\"a.sidemenu\")`. En ambos casos se mostraria solo los primeros 5 resultados encontrados, sin embargo, no encuentra ninguna clase sidemenu por lo que no muestra algun dato y la cantidad aparece en 0." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + }, + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get only the 'a' tags in 'sidemenu' class\n", "side_menus = soup(\"a\", class_=\"sidemenu\")\n", + "print(len(side_menus))\n", "side_menus[:5]" ] }, @@ -273,11 +454,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get elements with \"a.sidemenu\" CSS Selector.\n", "selected = soup.select(\"a.sidemenu\")\n", @@ -293,13 +485,34 @@ "Use BeautifulSoup to find all the `a` elements with class `mainmenu`." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "En este caso busca la etiqueta `` que pertenecen a la clase `mainmenu` dentro del HTML con la sintaxis de selectores y presentaría los ultimos 5 elementos. Sin embargo, como no encuentra una clase mainmenu, no presenta data." + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# YOUR CODE HERE\n" + "# Get elements with \"a.mainmenu\" CSS Selector.\n", + "selected = soup.select(\"a.mainmenu\")\n", + "selected[:5]\n" ] }, { @@ -316,22 +529,45 @@ "Getting the text inside an element is easy. All we have to do is use the `text` member of a `tag` object:" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "Intenta obtener todos los enlaces `` con la clase `sidemenu` y examinar el primero de ellos. También verifica el tipo de dato de la variable que contiene el primer enlace. \n", + "Sin embargo, como la lista no obtiene datos con la clase sidemenu, al intentar acceder a `side_menu_links[0]` se genera un `IndexError`. \n", + "Luego, nos arroja un NameError porque la variable `first_link` no está definida, ya que la lista `side_menu_links` estaba vacía en el paso anterior." 
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "# Get all sidemenu links as a list\n", - "side_menu_links = soup.select(\"a.sidemenu\")\n", - "\n", - "# Examine the first link\n", + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'soup' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Get all sidemenu links as a list\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m side_menu_links = \u001b[43msoup\u001b[49m.select(\u001b[33m\"\u001b[39m\u001b[33ma.sidemenu\u001b[39m\u001b[33m\"\u001b[39m) \n\u001b[32m 4\u001b[39m \u001b[38;5;66;03m# Examine the first link\u001b[39;00m\n\u001b[32m 5\u001b[39m \u001b[38;5;28mprint\u001b[39m(side_menus)\n", + "\u001b[31mNameError\u001b[39m: name 'soup' is not defined" + ] + } + ], + "source": [ + " # Get all sidemenu links as a list\n", + "side_menu_links = soup.select(\"a.sidemenu\") \n", + "\n", + " # Examine the first link\n", + "print(side_menus)\n", "first_link = side_menu_links[0]\n", "print(first_link)\n", "\n", - "# What class is this variable?\n", + " # What class is this variable?\n", "print('Class: ', type(first_link))" ] }, @@ -344,11 +580,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'first_link' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + 
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[27]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mfirst_link\u001b[49m.text)\n", + "\u001b[31mNameError\u001b[39m: name 'first_link' is not defined" + ] + } + ], "source": [ "print(first_link.text)" ] @@ -364,11 +612,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'first_link' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mfirst_link\u001b[49m[\u001b[33m'\u001b[39m\u001b[33mhref\u001b[39m\u001b[33m'\u001b[39m])\n", + "\u001b[31mNameError\u001b[39m: name 'first_link' is not defined" + ] + } + ], "source": [ "print(first_link['href'])" ] @@ -388,7 +648,10 @@ "metadata": {}, "outputs": [], "source": [ - "# YOUR CODE HERE\n" + "# YOUR CODE HERE\n", + "\n", + "\n", + "print(first_link['href'])" ] }, { @@ -417,7 +680,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": { "tags": [] }, @@ -442,9 +705,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get all table row elements\n", "rows = soup.find_all(\"tr\")\n", @@ -460,7 +734,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ 
-480,9 +754,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mIndexError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[44]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m example_row = \u001b[43mrows\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m2\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 2\u001b[39m \u001b[38;5;28mprint\u001b[39m(example_row.prettify())\n", + "\u001b[31mIndexError\u001b[39m: list index out of range" + ] + } + ], "source": [ "example_row = rows[2]\n", "print(example_row.prettify())" @@ -501,9 +787,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'example_row' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[41]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m cell \u001b[38;5;129;01min\u001b[39;00m \u001b[43mexample_row\u001b[49m.select(\u001b[33m'\u001b[39m\u001b[33mtd\u001b[39m\u001b[33m'\u001b[39m):\n\u001b[32m 2\u001b[39m \u001b[38;5;28mprint\u001b[39m(cell)\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m()\n", + "\u001b[31mNameError\u001b[39m: name 'example_row' is not defined" + ] + } + ], "source": [ "for cell in example_row.select('td'):\n", " print(cell)\n", @@ -527,11 +825,23 @@ }, { "cell_type": 
"code", - "execution_count": null, + "execution_count": 40, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'example_row' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[40]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[43mexample_row\u001b[49m.select(\u001b[33m'\u001b[39m\u001b[33mtd\u001b[39m\u001b[33m'\u001b[39m) == example_row.select(\u001b[33m'\u001b[39m\u001b[33m.detail\u001b[39m\u001b[33m'\u001b[39m) == example_row.select(\u001b[33m'\u001b[39m\u001b[33mtd.detail\u001b[39m\u001b[33m'\u001b[39m)\n", + "\u001b[31mNameError\u001b[39m: name 'example_row' is not defined" + ] + } + ], "source": [ "assert example_row.select('td') == example_row.select('.detail') == example_row.select('td.detail')" ] @@ -545,9 +855,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'example_row' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[39]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Select only those 'td' tags with class 'detail' \u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m detail_cells = \u001b[43mexample_row\u001b[49m.select(\u001b[33m'\u001b[39m\u001b[33mtd.detail\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m 3\u001b[39m detail_cells\n", + "\u001b[31mNameError\u001b[39m: name 
'example_row' is not defined" + ] + } + ], "source": [ "# Select only those 'td' tags with class 'detail' \n", "detail_cells = example_row.select('td.detail')\n", @@ -563,9 +885,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'detail_cells' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[38]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Keep only the text in each of those cells\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m row_data = [cell.text \u001b[38;5;28;01mfor\u001b[39;00m cell \u001b[38;5;129;01min\u001b[39;00m \u001b[43mdetail_cells\u001b[49m]\n\u001b[32m 4\u001b[39m \u001b[38;5;28mprint\u001b[39m(row_data)\n", + "\u001b[31mNameError\u001b[39m: name 'detail_cells' is not defined" + ] + } + ], "source": [ "# Keep only the text in each of those cells\n", "row_data = [cell.text for cell in detail_cells]\n", @@ -582,9 +916,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'row_data' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[37]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mrow_data\u001b[49m[\u001b[32m0\u001b[39m]) \u001b[38;5;66;03m# Name\u001b[39;00m\n\u001b[32m 2\u001b[39m 
\u001b[38;5;28mprint\u001b[39m(row_data[\u001b[32m3\u001b[39m]) \u001b[38;5;66;03m# District\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m(row_data[\u001b[32m4\u001b[39m]) \u001b[38;5;66;03m# Party\u001b[39;00m\n", + "\u001b[31mNameError\u001b[39m: name 'row_data' is not defined" + ] + } + ], "source": [ "print(row_data[0]) # Name\n", "print(row_data[3]) # District\n", @@ -988,7 +1334,7 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -1002,12 +1348,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "vscode": { - "interpreter": { - "hash": "b6f9fe9f4b7182690503d8ecc2bae97b0ee3ebf54e877167ae4d28c119a56988" - } + "version": "3.13.7" } }, "nbformat": 4, From 86a76e90dc968f069267167cdb29456aef326c98 Mon Sep 17 00:00:00 2001 From: andrespalate12-eng Date: Tue, 26 Aug 2025 21:59:45 -0500 Subject: [PATCH 2/2] actualizacion final del web_scraping --- lessons/02_web_scraping.ipynb | 596 ++++++++++++++++++++++++++++------ 1 file changed, 497 insertions(+), 99 deletions(-) diff --git a/lessons/02_web_scraping.ipynb b/lessons/02_web_scraping.ipynb index 09599f8..18a8069 100644 --- a/lessons/02_web_scraping.ipynb +++ b/lessons/02_web_scraping.ipynb @@ -153,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "metadata": { "tags": [] }, @@ -212,14 +212,32 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n" + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " English\n", + " , \n", + " Afrikaans\n", + " , \n", + " Albanian\n", + " , \n", + " Arabic\n", + " , \n", + " Armenian\n", + " , \n", + " Azerbaijani\n", + " , \n", + " Basque\n", + " , \n", + " Bengali\n", + " , \n", + " Bosnian\n", + " , \n", + " Catalan\n", + " ]\n" ] } ], @@ -344,20 +400,21 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": { "tags": [] }, "outputs": [ { - "ename": "IndexError", - "evalue": "list index out of range", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mIndexError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m a_tags = soup.find_all(\u001b[33m\"\u001b[39m\u001b[33ma\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 2\u001b[39m a_tags_alt = soup(\u001b[33m\"\u001b[39m\u001b[33ma\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43ma_tags\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m)\n\u001b[32m 4\u001b[39m \u001b[38;5;28mprint\u001b[39m(a_tags_alt[\u001b[32m0\u001b[39m])\n", - "\u001b[31mIndexError\u001b[39m: list index out of range" + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " English\n", + " \n", + "\n", + " English\n", + " \n" ] } ], @@ -377,14 +434,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0\n" + "270\n" ] } ], @@ -413,7 +470,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 13, "metadata": { "tags": [] }, @@ -431,7 +488,7 @@ "[]" ] }, - "execution_count": 20, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -454,7 +511,7 @@ }, { "cell_type": "code", - 
"execution_count": 18, + "execution_count": 14, "metadata": { "tags": [] }, @@ -465,7 +522,7 @@ "[]" ] }, - "execution_count": 18, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -495,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -504,15 +561,15 @@ "[]" ] }, - "execution_count": 24, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get elements with \"a.mainmenu\" CSS Selector.\n", - "selected = soup.select(\"a.mainmenu\")\n", - "selected[:5]\n" + "selected_main = soup.select(\"a.mainmenu\")\n", + "selected_main[:5]\n" ] }, { @@ -536,25 +593,25 @@ "## Explicación del bloque del código ##\n", "Intenta obtener todos los enlaces `` con la clase `sidemenu` y examinar el primero de ellos. También verifica el tipo de dato de la variable que contiene el primer enlace. \n", "Sin embargo, como la lista no obtiene datos con la clase sidemenu, al intentar acceder a `side_menu_links[0]` se genera un `IndexError`. \n", - "Luego, nos arroja un NameError porque la variable `first_link` no está definida, ya que la lista `side_menu_links` estaba vacía en el paso anterior." + "Luego, con los comando `first_link.text` y `first_link['href']` nos arroja un `NameError` porque la variable `first_link` no está definida, ya que la lista `side_menu_links` estaba vacía en el paso anterior." 
] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 9, "metadata": { "tags": [] }, "outputs": [ { - "ename": "NameError", - "evalue": "name 'soup' is not defined", + "ename": "IndexError", + "evalue": "list index out of range", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Get all sidemenu links as a list\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m side_menu_links = \u001b[43msoup\u001b[49m.select(\u001b[33m\"\u001b[39m\u001b[33ma.sidemenu\u001b[39m\u001b[33m\"\u001b[39m) \n\u001b[32m 4\u001b[39m \u001b[38;5;66;03m# Examine the first link\u001b[39;00m\n\u001b[32m 5\u001b[39m \u001b[38;5;28mprint\u001b[39m(side_menus)\n", - "\u001b[31mNameError\u001b[39m: name 'soup' is not defined" + "\u001b[31mIndexError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 5\u001b[39m\n\u001b[32m 2\u001b[39m side_menu_links = soup.select(\u001b[33m\"\u001b[39m\u001b[33ma.sidemenu\u001b[39m\u001b[33m\"\u001b[39m) \n\u001b[32m 4\u001b[39m \u001b[38;5;66;03m# Examine the first link\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m first_link = \u001b[43mside_menu_links\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 6\u001b[39m \u001b[38;5;28mprint\u001b[39m(first_link)\n\u001b[32m 8\u001b[39m \u001b[38;5;66;03m# What class is this variable?\u001b[39;00m\n", + "\u001b[31mIndexError\u001b[39m: list index out of range" ] } ], @@ -563,7 +620,6 @@ "side_menu_links = soup.select(\"a.sidemenu\") \n", "\n", " # Examine the first link\n", - "print(side_menus)\n", "first_link = side_menu_links[0]\n", 
"print(first_link)\n", "\n", @@ -580,7 +636,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 10, "metadata": { "tags": [] }, @@ -592,7 +648,7 @@ "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[27]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mfirst_link\u001b[49m.text)\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mfirst_link\u001b[49m.text)\n", "\u001b[31mNameError\u001b[39m: name 'first_link' is not defined" ] } @@ -612,7 +668,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 18, "metadata": { "tags": [] }, @@ -624,7 +680,7 @@ "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mfirst_link\u001b[49m[\u001b[33m'\u001b[39m\u001b[33mhref\u001b[39m\u001b[33m'\u001b[39m])\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[18]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mfirst_link\u001b[49m[\u001b[33m'\u001b[39m\u001b[33mhref\u001b[39m\u001b[33m'\u001b[39m])\n", "\u001b[31mNameError\u001b[39m: name 'first_link' is not defined" ] } @@ -642,16 +698,38 @@ "Extract all `href` attributes for each `mainmenu` URL." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "Se buscan todos los enlaces `` que pertenecen a la clase `mainmenu` y se guardan en la lista `main_menu_links`. Luego se intenta acceder al primer elemento de esa lista y mostrar su atributo `href`. \n", + "Como resultado se genera un `IndexError` porque la lista `main_menu_links` está vacía, es decir, no se encontraron enlaces con la clase `mainmenu` en el HTML descargado." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mIndexError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[15]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Get all mainmenu links as a list\u001b[39;00m\n\u001b[32m 2\u001b[39m main_menu_links = soup.select(\u001b[33m\"\u001b[39m\u001b[33ma.mainmenu\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m first_link_menu = \u001b[43mmain_menu_links\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 4\u001b[39m \u001b[38;5;28mprint\u001b[39m(first_link_menu[\u001b[33m'\u001b[39m\u001b[33mhref\u001b[39m\u001b[33m'\u001b[39m])\n", + "\u001b[31mIndexError\u001b[39m: list index out of range" + ] + } + ], "source": [ - "# YOUR CODE HERE\n", "\n", - "\n", - "print(first_link['href'])" + "# Get all mainmenu links as a list\n", + "main_menu_links = soup.select(\"a.mainmenu\")\n", + "first_link_menu = main_menu_links[0]\n", + "print(first_link_menu['href'])\n" ] }, { @@ -678,6 +756,14 @@ "Let's scrape and parse the webpage, using the tools we learned in the previous section." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "Se realiza una solicitud HTTP GET a la página con el parámetro `GA=98`, se obtiene el contenido de la respuesta y se convierte en un árbol HTML con BeautifulSoup utilizando el parser `lxml`." + ] + }, { "cell_type": "code", "execution_count": 16, @@ -703,6 +789,16 @@ "Our goal is to obtain the elements in the table on the webpage. Remember: rows are identified by the `tr` tag. Let's use `find_all` to obtain these elements." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Explicación del bloque del código ##\n", + "Se obtienen todas las filas de tabla (`<tr>`) en el HTML usando dos métodos: `find_all(\"tr\")` y el selector CSS `'tr tr tr'`. Luego se imprimen las primeras 5 filas encontradas. \n", + "Sin embargo, como no se obtiene ningún dato, no hay resultados; asimismo, con el comando `len(rows)` podemos verificar que la longitud es 0. Por último, al tratar de acceder al tercer elemento se genera un `IndexError`, ya que la variable `rows` está vacía y no contiene ningún elemento." 
+ ] + }, { "cell_type": "code", "execution_count": 17, @@ -734,7 +830,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -754,7 +850,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -764,7 +860,7 @@ "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mIndexError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[44]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m example_row = \u001b[43mrows\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m2\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 2\u001b[39m \u001b[38;5;28mprint\u001b[39m(example_row.prettify())\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[22]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m example_row = \u001b[43mrows\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m2\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 2\u001b[39m \u001b[38;5;28mprint\u001b[39m(example_row.prettify())\n", "\u001b[31mIndexError\u001b[39m: list index out of range" ] } @@ -785,9 +881,24 @@ "* We could combine both and use the selector `td.detail`." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "Se recorren los elementos de una fila de tabla (`example_row`) y se trata de imprimir las celdas que coinciden con distintos selectores: \n", + "- `td` selecciona todas las celdas de la fila. \n", + "- `.detail` selecciona los elementos que tienen la clase `detail`. \n", + "- `td.detail` selecciona específicamente las celdas `` con la clase `detail`. 
\n", + "\n", + "Finalmente, con `assert` se valida que los tres selectores devuelven el mismo resultado.\n", + "\n", + "Presentamos como resultado el error `NameError` ya que el `example_row` no logro ser definida por lo mencionado en el anterior bloque de codigo" + ] + }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -797,7 +908,7 @@ "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[41]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m cell \u001b[38;5;129;01min\u001b[39;00m \u001b[43mexample_row\u001b[49m.select(\u001b[33m'\u001b[39m\u001b[33mtd\u001b[39m\u001b[33m'\u001b[39m):\n\u001b[32m 2\u001b[39m \u001b[38;5;28mprint\u001b[39m(cell)\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m()\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[23]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m cell \u001b[38;5;129;01min\u001b[39;00m \u001b[43mexample_row\u001b[49m.select(\u001b[33m'\u001b[39m\u001b[33mtd\u001b[39m\u001b[33m'\u001b[39m):\n\u001b[32m 2\u001b[39m \u001b[38;5;28mprint\u001b[39m(cell)\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m()\n", "\u001b[31mNameError\u001b[39m: name 'example_row' is not defined" ] } @@ -825,7 +936,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 24, "metadata": { "tags": [] }, @@ -837,7 +948,7 @@ "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[40]\u001b[39m\u001b[32m, line 
1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[43mexample_row\u001b[49m.select(\u001b[33m'\u001b[39m\u001b[33mtd\u001b[39m\u001b[33m'\u001b[39m) == example_row.select(\u001b[33m'\u001b[39m\u001b[33m.detail\u001b[39m\u001b[33m'\u001b[39m) == example_row.select(\u001b[33m'\u001b[39m\u001b[33mtd.detail\u001b[39m\u001b[33m'\u001b[39m)\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[24]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[43mexample_row\u001b[49m.select(\u001b[33m'\u001b[39m\u001b[33mtd\u001b[39m\u001b[33m'\u001b[39m) == example_row.select(\u001b[33m'\u001b[39m\u001b[33m.detail\u001b[39m\u001b[33m'\u001b[39m) == example_row.select(\u001b[33m'\u001b[39m\u001b[33mtd.detail\u001b[39m\u001b[33m'\u001b[39m)\n", "\u001b[31mNameError\u001b[39m: name 'example_row' is not defined" ] } @@ -853,9 +964,19 @@ "Let's use the selector `td.detail` to be as specific as possible." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "En este bloque se busca extraer únicamente las celdas `` de una fila (`example_row`) que tengan la clase `detail`. \n", + "Después, se crea una lista row_data que contiene solo el texto de esas celdas y por ultimo, se pueda acceder a posiciones específicas para extraer valores concretos como Name, District y Party. \n", + "Como resultado se tienen error del tipo `NameError` por lo antes mencionado con la variable `example_row`." 
+ ] + }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -865,7 +986,7 @@ "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[39]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Select only those 'td' tags with class 'detail' \u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m detail_cells = \u001b[43mexample_row\u001b[49m.select(\u001b[33m'\u001b[39m\u001b[33mtd.detail\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m 3\u001b[39m detail_cells\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[25]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Select only those 'td' tags with class 'detail' \u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m detail_cells = \u001b[43mexample_row\u001b[49m.select(\u001b[33m'\u001b[39m\u001b[33mtd.detail\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m 3\u001b[39m detail_cells\n", "\u001b[31mNameError\u001b[39m: name 'example_row' is not defined" ] } @@ -885,7 +1006,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -895,7 +1016,7 @@ "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[38]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Keep only the text in each of those cells\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m row_data = [cell.text \u001b[38;5;28;01mfor\u001b[39;00m cell \u001b[38;5;129;01min\u001b[39;00m \u001b[43mdetail_cells\u001b[49m]\n\u001b[32m 4\u001b[39m 
\u001b[38;5;28mprint\u001b[39m(row_data)\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[27]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Keep only the text in each of those cells\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m row_data = [cell.text \u001b[38;5;28;01mfor\u001b[39;00m cell \u001b[38;5;129;01min\u001b[39;00m \u001b[43mdetail_cells\u001b[49m]\n\u001b[32m 4\u001b[39m \u001b[38;5;28mprint\u001b[39m(row_data)\n", "\u001b[31mNameError\u001b[39m: name 'detail_cells' is not defined" ] } @@ -916,7 +1037,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -926,7 +1047,7 @@ "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[37]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mrow_data\u001b[49m[\u001b[32m0\u001b[39m]) \u001b[38;5;66;03m# Name\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;28mprint\u001b[39m(row_data[\u001b[32m3\u001b[39m]) \u001b[38;5;66;03m# District\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m(row_data[\u001b[32m4\u001b[39m]) \u001b[38;5;66;03m# Party\u001b[39;00m\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mrow_data\u001b[49m[\u001b[32m0\u001b[39m]) \u001b[38;5;66;03m# Name\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;28mprint\u001b[39m(row_data[\u001b[32m3\u001b[39m]) \u001b[38;5;66;03m# District\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m(row_data[\u001b[32m4\u001b[39m]) \u001b[38;5;66;03m# Party\u001b[39;00m\n", "\u001b[31mNameError\u001b[39m: 
name 'row_data' is not defined" ] } @@ -946,11 +1067,34 @@ "We saw at the beginning that not all of the rows we got actually correspond to a senator. We'll need to do some cleaning before we can proceed forward. Take a look at some examples:" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "Este bloque intenta inspeccionar filas específicas de la lista `rows` y luego medir su tamaño con `len(rows[0])` para distinguir entre filas “malas” y “buenas”. \n", + "\n", + "Sin embargo, se produce un `IndexError`al acceder a `rows[0]` porque `rows` está vacía (no hay ningún elemento en esa posición). Por el mismo motivo, la llamada `len(rows[0])` vuelve a fallar e intenta obtener la longitud de un elemento que no existe.\n", + "\n" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mIndexError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[29]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m'\u001b[39m\u001b[33mRow 0:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m'\u001b[39m, \u001b[43mrows\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m, \u001b[33m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m'\u001b[39m)\n\u001b[32m 2\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m'\u001b[39m\u001b[33mRow 1:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m'\u001b[39m, rows[\u001b[32m1\u001b[39m], \u001b[33m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m'\u001b[39m)\n\u001b[32m 3\u001b[39m 
\u001b[38;5;28mprint\u001b[39m(\u001b[33m'\u001b[39m\u001b[33mLast Row:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m'\u001b[39m, rows[-\u001b[32m1\u001b[39m])\n", + "\u001b[31mIndexError\u001b[39m: list index out of range" + ] + } + ], "source": [ "print('Row 0:\\n', rows[0], '\\n')\n", "print('Row 1:\\n', rows[1], '\\n')\n", @@ -968,9 +1112,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mIndexError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[30]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Bad rows\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28mlen\u001b[39m(\u001b[43mrows\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m))\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28mlen\u001b[39m(rows[\u001b[32m1\u001b[39m]))\n\u001b[32m 5\u001b[39m \u001b[38;5;66;03m# Good rows\u001b[39;00m\n", + "\u001b[31mIndexError\u001b[39m: list index out of range" + ] + } + ], "source": [ "# Bad rows\n", "print(len(rows[0]))\n", @@ -988,11 +1144,37 @@ "Perhaps good rows have a length of 5. Let's check:" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "\n", + "En este bloque se intenta filtrar las filas \"buenas\" de la lista `rows`. \n", + "Primero se crea la lista `good_rows`, que guarda únicamente aquellas filas cuyo tamaño (`len(row)`) es igual a 5. \n", + "\n", + "Posteriormente, se intenta imprimir algunos elementos de `good_rows`. 
Sin embargo, se genera un `IndexError` al tratar de imprimir el primer elemento, es decir `good_rows[0]`, ya que la lista `good_rows` está vacía.\n", + "\n", + "De manera similar, al intentar acceder a `rows[2].select('td.detail')` o a `rows[-1]`, aparece el mismo error `IndexError`: la lista `rows` está vacía y no existe ningún elemento en esas posiciones. \n" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mIndexError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[31]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m good_rows = [row \u001b[38;5;28;01mfor\u001b[39;00m row \u001b[38;5;129;01min\u001b[39;00m rows \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(row) == \u001b[32m5\u001b[39m]\n\u001b[32m 3\u001b[39m \u001b[38;5;66;03m# Let's check some rows\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mgood_rows\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m, \u001b[33m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m'\u001b[39m)\n\u001b[32m 5\u001b[39m \u001b[38;5;28mprint\u001b[39m(good_rows[-\u001b[32m2\u001b[39m], \u001b[33m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m'\u001b[39m)\n\u001b[32m 6\u001b[39m \u001b[38;5;28mprint\u001b[39m(good_rows[-\u001b[32m1\u001b[39m])\n", + "\u001b[31mIndexError\u001b[39m: list index out of range" + ] + } + ], "source": [ "good_rows = [row for row in rows if len(row) == 5]\n", "\n", @@ -1011,18 +1193,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + 
"ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mIndexError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[32]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mrows\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m2\u001b[39;49m\u001b[43m]\u001b[49m.select(\u001b[33m'\u001b[39m\u001b[33mtd.detail\u001b[39m\u001b[33m'\u001b[39m) \n", + "\u001b[31mIndexError\u001b[39m: list index out of range" + ] + } + ], "source": [ "rows[2].select('td.detail') " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mIndexError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[33]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Bad row\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mrows\u001b[49m\u001b[43m[\u001b[49m\u001b[43m-\u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m]\u001b[49m.select(\u001b[33m'\u001b[39m\u001b[33mtd.detail\u001b[39m\u001b[33m'\u001b[39m), \u001b[33m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m'\u001b[39m)\n\u001b[32m 4\u001b[39m \u001b[38;5;66;03m# Good row\u001b[39;00m\n\u001b[32m 5\u001b[39m \u001b[38;5;28mprint\u001b[39m(rows[\u001b[32m5\u001b[39m].select(\u001b[33m'\u001b[39m\u001b[33mtd.detail\u001b[39m\u001b[33m'\u001b[39m), \u001b[33m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m'\u001b[39m)\n", + 
"\u001b[31mIndexError\u001b[39m: list index out of range" + ] + } + ], "source": [ "# Bad row\n", "print(rows[-1].select('td.detail'), '\\n')\n", @@ -1054,9 +1260,17 @@ "Now that we've seen how to get the data we want from one row, as well as filter out the rows we don't want, let's put it all together into a loop." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "En este bloque se busca procesar las filas válidas (`valid_rows`) de la lista `rows` y almacenar información de cada senador en una lista llamada `members` y al final mostrar los primeros 5 registros. Como la lista `rows` esta vacia desde el inicio, `valid_rows` también estará vacía y, en consecuencia, `members` quedará como una lista vacía []" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": { "tags": [] }, @@ -1086,9 +1300,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Should be 61\n", "len(members)" @@ -1103,9 +1328,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n" + ] + } + ], "source": [ "print(members[:5])" ] @@ -1147,9 +1380,22 @@ "The code has been partially filled out for you. Fill it in where it says `#YOUR CODE HERE`. Save the path into an object called `full_path`." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "Se hace una petición HTTP a la página del Senado y se analiza el contenido con `BeautifulSoup` usando el parser lxml.\n", + "Se seleccionan todas las filas `` y se filtran solo aquellas que contienen celdas con clase `td.detail`, que corresponden a los datos de los senadores. \n", + "\n", + "Posteriormente, se recorre las filas y extrae información, para cada fila válida se obtiene los datos: Nombre del senador, Número de distrito, Partido político, Enlace a los proyectos de ley (Bills), construyendo la URL completa a partir del atributo `href`.\n", + "\n", + "Toda la información se guarda en una lista members como tuplas (name, district, party, full_path). Finalmente se pueden inspeccionar los primeros cinco elementos para verificar los datos." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 61, "metadata": { "tags": [] }, @@ -1163,9 +1409,8 @@ "soup = BeautifulSoup(src, \"lxml\")\n", "# Create empty list to store our data\n", "members = []\n", - "\n", "# Returns every ‘tr tr tr’ css selector in the page\n", - "rows = soup.select('tr tr tr')\n", + "rows = soup.select('tr')\n", "# Get rid of junk rows\n", "rows = [row for row in rows if row.select('td.detail')]\n", "\n", @@ -1174,14 +1419,21 @@ " # Select only those 'td' tags with class 'detail'\n", " detail_cells = row.select('td.detail') \n", " # Keep only the text in each of those cells\n", - " row_data = [cell.text for cell in detail_cells]\n", + " row_data = [cell.text.strip() for cell in detail_cells] # strip() limpia espacios extra\n", + " #row_data = [cell.text for cell in detail_cells]\n", " # Collect information\n", " name = row_data[0]\n", " district = int(row_data[3])\n", " party = row_data[4]\n", - "\n", " # YOUR CODE HERE\n", - " full_path = ''\n", + " # Buscar el enlace hacia \"Bills\"\n", + " bill_link = 
row.select_one('a[href*=\"SenatorBills.asp\"]')\n", + " if bill_link:\n", + " relative_path = bill_link['href'] # extrae el atributo href\n", + " full_path = \"http://www.ilga.gov\" + relative_path # construye el enlace completo\n", + " else:\n", + " full_path = None # en caso de que no exista enlace\n", + "\n", "\n", " # Store in a tuple\n", " senator = (name, district, party, full_path)\n", @@ -1191,14 +1443,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 62, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Uncomment to test \n", - "# members[:5]" + "members[:5]" ] }, { @@ -1210,9 +1473,22 @@ "Turn the code above into a function that accepts a URL, scrapes the URL for its senators, and returns a list of tuples containing information about each senator. " ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "Este bloque define una función modular llamada get_members que realiza scraping de una página del Senado de Illinois y devuelve la información de cada senador en forma de tuplas.\n", + "La función acepta cualquier URL de la página del Senado que siga la misma estructura de tablas.\n", + "Se hace una petición HTTP a la URL y se analiza el contenido con BeautifulSoup usando el parser lxml. Se seleccionan todas las filas `` y se filtran solo las que contienen celdas con clase `td.detail`, que corresponden a los datos de los senadores. \n", + "\n", + "Extrae la información de cada senador como en el anterior ejercicio, construyendo la URL completa a partir del atributo `href`. Cada senador se almacena como una tupla (nombre, distrito, partido, full_path) y se agrega a la lista members. \n", + "Al final, la función devuelve la lista con todos los senadores extraídos." 
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 68, "metadata": { "tags": [] }, @@ -1220,16 +1496,56 @@ "source": [ "# YOUR CODE HERE\n", "def get_members(url):\n", - " return [___]\n" + " # Hacer la petición HTTP\n", + " req = requests.get(url)\n", + " src = req.text\n", + " soup = BeautifulSoup(src, \"lxml\")\n", + "\n", + " members = []\n", + "\n", + " # Seleccionar todas las filas que contengan celdas con clase 'detail'\n", + " rows = [row for row in soup.select('tr') if row.select('td.detail')]\n", + "\n", + " # Recorrer las filas válidas\n", + " for row in rows:\n", + " detail_cells = row.select('td.detail')\n", + " row_data = [cell.text.strip() for cell in detail_cells]\n", + "\n", + " name = row_data[0]\n", + " district = int(row_data[3])\n", + " party = row_data[4]\n", + "\n", + " # Obtener el enlace a los Bills\n", + " bill_link = row.select_one('a[href*=\"SenatorBills.asp\"]')\n", + " if bill_link:\n", + " full_path = \"http://www.ilga.gov\" + bill_link['href']\n", + " else:\n", + " full_path = None\n", + "\n", + " # Guardar la información en la lista\n", + " members.append((name, district, party, full_path))\n", + "\n", + " return members\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 73, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Test your code\n", "url = 'http://www.ilga.gov/senate/default.asp?GA=98'\n", @@ -1258,9 +1574,18 @@ "This function has been partially completed. Fill in the rest." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "En el codigo agregado se selecciona todas las filas `` y dentro de ellas solo las celdas `` con clase `billlist`, que contienen los datos de cada bill. 
\n", + "Extrae los distintos datos para almacenar cada bill como una tupla (bill_id, description, chamber, last_action, last_action_date) y la agrega a la lista `bills`. Como en la variable `senate_members` no tenia data, en este caso tendre un error del tipo `IndexError` ya que trata de obtener un elemento en una posición vacia." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 70, "metadata": { "tags": [] }, @@ -1273,11 +1598,15 @@ " bills = []\n", " for row in rows:\n", " # YOUR CODE HERE\n", - " bill_id =\n", - " description =\n", - " chamber =\n", - " last_action =\n", - " last_action_date =\n", + " cells = row.select('td.billlist') # Solo las celdas con clase billlist\n", + " if len(cells) < 5:\n", + " continue # Ignorar filas incompletas\n", + "\n", + " bill_id = cells[0].text.strip() # 1ra columna\n", + " description = cells[1].text.strip() # 2da columna\n", + " chamber = cells[2].text.strip() # 3ra columna\n", + " last_action = cells[3].text.strip() # 4ta columna\n", + " last_action_date = cells[4].text.strip() # 5ta columna\n", " bill = (bill_id, description, chamber, last_action, last_action_date)\n", " bills.append(bill)\n", " return bills" @@ -1289,11 +1618,23 @@ "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mIndexError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[72]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Uncomment to test your code\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m test_url = \u001b[43msenate_members\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 3\u001b[39m 
get_bills(test_url)[\u001b[32m0\u001b[39m:\u001b[32m5\u001b[39m]\n", + "\u001b[31mIndexError\u001b[39m: list index out of range" + ] + } + ], "source": [ "# Uncomment to test your code\n", - "# test_url = senate_members[0][3]\n", - "# get_bills(test_url)[0:5]" + "test_url = senate_members[0][3]\n", + "get_bills(test_url)[0:5]" ] }, { @@ -1307,27 +1648,84 @@ "**NOTE:** please call the function `time.sleep(1)` for each iteration of the loop, so that we don't destroy the state's web site." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicación del bloque del código ##\n", + "Se crea un diccionario vacío `bills_dict` que almacenará como clave el número de distrito y como valor la lista de `bills` correspondientes a ese distrito. Luego se itera sobre cada senador en `members_dict`.\n", + "Para cada senador, se obtiene la URL de sus bills y se llama a la función `get_bills(bill_url)`.\n", + "Los resultados se almacenan en `bills_dict` bajo la clave correspondiente al distrito del senador.\n", + "Se incluye `time.sleep(1)` en cada iteración para evitar saturar el sitio web. \n", + "\n", + "En este caso, como los datos de senadores no se extrajeron correctamente en pasos previos, `members_dict` está vacío. Por ello, `bills_dict` también resulta vacío y al intentar acceder a una clave específica como `bills_dict[52]` se genera un `KeyError`." 
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 81, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n", + "[]\n" + ] + } + ], "source": [ - "# YOUR CODE HERE\n" + "# YOUR CODE HERE\n", + "# Crear un diccionario para almacenar los bills por distrito\n", + "bills_dict = {}\n", + "# Crear un diccionario a partir de la lista de miembros\n", + "# La clave será el distrito, el valor la tupla completa de información del senador\n", + "members_dict = {member[1]: member for member in senate_members}\n", + "\n", + "# Verificar el diccionario\n", + "print(list(members_dict.keys())) # Primeros los distritos\n", + "print(list(members_dict.values())) # Primeros los senadores\n", + "# Recorrer todos los senadores en members_dict\n", + "for district, member_info in members_dict.items():\n", + " bill_url = member_info[3] # Tomar la URL de Bills de cada senador\n", + " if bill_url: # Verificar que exista URL\n", + " bills = get_bills(bill_url) # Obtener los bills de ese senador\n", + " bills_dict[district] = bills\n", + " else:\n", + " bills_dict[district] = [] # Si no hay URL, dejar lista vacía\n", + "\n", + " time.sleep(1) # Pausa de 1 segundo para no saturar el sitio\n", + "\n", + "# Verificar algunos resultados\n", + "for district in list(bills_dict.keys())[:5]:\n", + " print(f\"Distrito {district}: {len(bills_dict[district])} bills\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 82, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "ename": "KeyError", + "evalue": "52", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mKeyError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[82]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m 
\u001b[38;5;66;03m# Uncomment to test your code\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[43mbills_dict\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m52\u001b[39;49m\u001b[43m]\u001b[49m\n", + "\u001b[31mKeyError\u001b[39m: 52" + ] + } + ], "source": [ "# Uncomment to test your code\n", - "# bills_dict[52]" + "bills_dict[52]" ] } ],