diff --git a/.github/workflows/scrape.yml b/.github/workflows/scrape.yml new file mode 100644 index 000000000..c07e5852e --- /dev/null +++ b/.github/workflows/scrape.yml @@ -0,0 +1,28 @@ +on: + schedule: + # Run every day at 3am ET (7am UTC) + # See: https://crontab.guru/#0_7_*_*_* + - cron: "0 7 * * *" + push: + + +jobs: + scrape: + runs-on: ubuntu-latest + # Only run when scheduled, or if push has "[force ci]" in its last commit message. + if: "github.event_name == 'schedule' || contains(github.event.head_commit.message, '[force ci]')" + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v3 + with: + python-version: '3.10' + + - run: pip install --user pipenv + + - run: make run-code + + - uses: actions/upload-artifact@v3 + with: + name: url-list + path: list.csv diff --git a/.ipynb_checkpoints/MissingPersons_DataExtract_Tom_May10-checkpoint.ipynb b/.ipynb_checkpoints/MissingPersons_DataExtract_Tom_May10-checkpoint.ipynb index d11dd8651..eea65226f 100644 --- a/.ipynb_checkpoints/MissingPersons_DataExtract_Tom_May10-checkpoint.ipynb +++ b/.ipynb_checkpoints/MissingPersons_DataExtract_Tom_May10-checkpoint.ipynb @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "cf682cb3", "metadata": {}, "outputs": [], @@ -43,7 +43,25 @@ "from selenium import webdriver\n", "from selenium.webdriver.support.ui import Select\n", "from bs4 import BeautifulSoup as bs\n", - "import requests" + "import requests\n", + "import json\n", + "import csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "554bc853", + "metadata": {}, + "outputs": [], + "source": [ + "from webdriver_manager.chrome import ChromeDriverManager\n", + "from selenium.webdriver.chrome.service import Service\n", + "from selenium.webdriver.chrome.options import Options\n", + "\n", + "# Allow Chrome to run in headless mode (required for running in GitHub Actions)\n", + "options = Options()\n", + "options.headless = True" ] }, { @@ -55,11 +73,10 @@ }, "outputs": [], "source": [ - "link = 'https://www.services.rcmp-grc.gc.ca/missing-disparus/search-recherche.jsf'\n", - "driver=r'C:\\Users\\Neptune\\Downloads\\chromedriver'\n", + "# Open Chrome browser. 
(get chrome driver if it isn't installed already)\n", + "browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)\n", "\n", - "# using the selenium web driver I downloaded\n", - "browser = webdriver.Chrome(driver)\n", + "link = 'https://www.services.rcmp-grc.gc.ca/missing-disparus/search-recherche.jsf'\n", "browser.get(link)\n", "time.sleep(2)\n", "\n", @@ -75,7 +92,7 @@ "except:\n", " print('could not find Search')\n", "\n", - "time.sleep(5)\n", + "time.sleep(3)\n", "\n", "not_last = True\n", "# to store all the URLs\n", @@ -107,7 +124,7 @@ " next_page = browser.find_element_by_xpath('/html/body/main/form/div[33]/ul/li[83]/a')\n", " print('Found next button to press.')\n", " next_page.click()\n", - " time.sleep(4) # wait for next page to load\n", + " time.sleep(2) # wait for next page to load\n", " except:\n", " # should not have a next button on the last page\n", " print('last page or no next button found!')\n", @@ -116,7 +133,12 @@ "# the final list\n", "print(\"================================== END ==================================\")\n", "print(len(URLs[:30])) # show a section \n", - "time.sleep(5)\n", + "time.sleep(2)\n", + "\n", + "# write progress to csv\n", + "df = pd.DataFrame(URLs, columns=[\"URL\"])\n", + "df.to_csv('list.csv', index=False)\n", + "\n", "browser.quit()" ] }, @@ -128,6 +150,14 @@ "#### Collect all the data from all the detailed case pages" ] }, + { + "cell_type": "markdown", + "id": "77947f1a", + "metadata": {}, + "source": [ + "This part of the code adapted from the CBC script" + ] + }, { "cell_type": "code", "execution_count": null, @@ -135,13 +165,377 @@ "metadata": {}, "outputs": [], "source": [ - "# loop through all the URLs and collect the data from each page\n", - "for url in URLs:\n", - " persons_page_url = 'https://www.services.rcmp-grc.gc.ca' + url" + "base_url = r'https://www.services.rcmp-grc.gc.ca'\n", + "\n", + "#CLEANING FUNCTION\n", + "def cleaning_function(item):\n", + " item = str(item)\n", + " item = item.replace(\"
\" , \"\")\n", + " item = item.replace(\"
\" , \"\")\n", + " item = item.replace(\"

\" , \"\")\n", + " item = item.replace(\"

\" , \"\")\n", + " item = item.replace(\"Missing from \" , \"\")\n", + " item = item.replace(\"\" , \"\")\n", + " item = item.replace(\"\" , \"\")\n", + " return item" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d72d3b0e", + "metadata": {}, + "outputs": [], + "source": [ + "#this is the list where all the URLs from the sheet will go\n", + "person_url_list = []\n", + "#this is where all the person info will go\n", + "person_info = []\n", + "\n", + "#a list for the sections later\n", + "section_list = []\n", + "\n", + "#I have this because I dont know how else to filter out stuff from an if statement that I dont want\n", + "count_working = 0\n", + "\n", + "for page_url in URLs:\n", + " print(\"Record Number: \" + str(count_working))\n", + " print(\"Case URL: \" + page_url)\n", + " count_working += 1\n", + " url = base_url + page_url\n", + " \n", + " # request the html\n", + " try:\n", + " page = requests.get(url, timeout = 10)\n", + " except requests.exceptions.Timeout:\n", + " print(\"Timeout occurred\")\n", + " # structure the page content for parsing\n", + " soup = bs(page.content, 'html.parser') \n", + " \n", + " # First we have to pull out the content area\n", + " content_area = soup.find('main' , {\"property\" : \"mainContentOfPage\"})\n", + " \n", + " # LOCATION\n", + " try:\n", + " location_scrape = content_area.find('div')\n", + " location_isolate = location_scrape.find_all('p')\n", + " location_string = str(location_isolate[2])\n", + " location_split = location_string.split(\",\")\n", + " province = cleaning_function(location_split[1])\n", + " city = cleaning_function(location_split[0])\n", + " except:\n", + " print('No Location')\n", + " \n", + " # STATUS\n", + " status_scrape = content_area.find_all('h2')\n", + " status = status_scrape[:1]\n", + " status = str(status)\n", + " front_of_status = status.index('

<h2>') + 4\n",
" back_of_status = status.index('</h2>
')\n", + " status_cleaned = (f'{status[front_of_status : back_of_status]}')\n", + " \n", + " # FOR THE MISSING ENTRIES\n", + " if 'Missing' in status_cleaned:\n", + " #Now we get into pulling out individual details which will eventually be compiled in a list\n", + " #NAME(MISSING)\n", + " name_scrape = content_area.find_all('h3')\n", + " person_name = name_scrape[:1]\n", + " person_name = str(person_name)\n", + " front_of_name = person_name.index('

<h3>') + 4\n",
" back_of_name = person_name.index('</h3>
')\n", + " name_cleaned = (f'{person_name[front_of_name : back_of_name]}')\n", + " name_split = name_cleaned.split(',')\n", + " last_name = name_split[0]\n", + " first_name = name_split[1]\n", + " first_name_string = str(name_split[1:2])\n", + " first_name_string = first_name_string.replace('[',\"\")\n", + " first_name_string = first_name_string.replace(']',\"\")\n", + " first_name_string = first_name_string.replace(\"'\",\"\")\n", + " first_name_string = first_name_string.replace(\"\\n\",\"\")\n", + " first_name_string = first_name_string.strip()\n", + " \n", + " #PERSON DETAILS(MISSING)\n", + " try:\n", + " person_details = content_area.find_all('dd')\n", + " date_missing_discovered = person_details[0]\n", + " year_born = person_details[1]\n", + " age_at_disappearance = person_details[2]\n", + " gender = person_details[3]\n", + " bio_group = person_details[4]\n", + " except:\n", + " print('Data error')\n", + "\n", + " #FOR THE UNIDENTIFIED ENTRIES\n", + " else:\n", + " try:\n", + " first_name_string = 'Unidentified'\n", + " last_name = 'Unidentified'\n", + " person_details = content_area.find_all('dd')\n", + " date_missing_discovered = person_details[0]\n", + " age_at_disappearance = person_details[1]\n", + " gender = person_details[2]\n", + " bio_group = person_details[3]\n", + " year_born = 'Unknown'\n", + " except:\n", + " print('Data error2')\n", + " \n", + " \n", + " #PUT IT ALL TOGETHER\n", + " person_info.append([first_name_string , last_name , status_cleaned , cleaning_function(date_missing_discovered) , cleaning_function(year_born) , cleaning_function(age_at_disappearance) , cleaning_function(gender) , cleaning_function(bio_group) , city , province , url])\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "30062b69", + "metadata": {}, + "source": [ + "### Save the file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b297ad3", + "metadata": {}, + "outputs": [], + "source": [ + "# print to a file\n", + "full_file = pd.DataFrame(person_info)\n", + "full_file.to_csv(\"output_rcmp.csv\")\n", + "print('Done')" + ] + }, + { + "cell_type": "markdown", + "id": "72e7fbf7", + "metadata": {}, + "source": [ + "### To Avoid Running the URL Collector Again - Run Code Below" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e5e73d5", + "metadata": {}, + "outputs": [], + "source": [ + "with open('list.csv') as f:\n", + " allLines = f.readlines()\n", + " TempURLs = list(allLines)\n", + " # remove the column header\n", + " TempURLs = TempURLs[1:]\n", + " f.close()\n", + "\n", + "# clean the elements \n", + "URLs = []\n", + "for link in TempURLs:\n", + " URLs.append(link.strip())\n", + " \n", + "print(URLs[:10])" + ] + }, + { + "cell_type": "markdown", + "id": "32bc0a32", + "metadata": {}, + "source": [ + "#### Function to Turn DL sections into dict - No Longer Used!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40cad4bf", + "metadata": {}, + "outputs": [], + "source": [ + "def create_dl_dict(soup):\n", + " keys, values = [] , []\n", + " for dl in soup.find_all(\"dl\", {\"class\":\"dl-horizontal\"}):\n", + " for dt in dl.find_all(\"dt\"):\n", + " keys.append(dt.text.strip())\n", + " for dd in dl.find_all(\"dd\"):\n", + " values.append(dd.text.strip())\n", + " \n", + " return dict(zip(keys,values))" + ] + }, + { + "cell_type": "markdown", + "id": "5513dd44", + "metadata": {}, + "source": [ + "### Second Method - For More Structured Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f01a313d", + "metadata": {}, + "outputs": [], + "source": [ + "#I have this because I dont know how else to filter out stuff from an if statement that I dont want\n", + "count_working = 0\n", + "\n", + "# complete file\n", + "complete_db = {}\n", + "\n", + "# used to test\n", + "test_URLs = URLs[:2]\n", + "\n", + "# loop through all the URLs\n", + "for page_url in URLs:\n", + " \n", + " # page dict\n", + " page_dict = {}\n", + " #this is where all the person info will go\n", + " page_sections = []\n", + " # make the full URL\n", + " url = base_url + page_url\n", + " \n", + " print('==============================================')\n", + " print(\"Record Number: \" + str(count_working))\n", + " print(\"Case URL: \" + url)\n", + " \n", + " # next record\n", + " count_working += 1\n", + " \n", + " # request the html\n", + " try:\n", + " page = requests.get(url, timeout = 10)\n", + " except requests.exceptions.Timeout:\n", + " print(\"Timeout occurred\")\n", + " \n", + " # structure the page content for parsing\n", + " soup = bs(page.content, 'html.parser') \n", + " \n", + " #print(soup)\n", + " \n", + " # First we have to pull out the content area\n", + " content_area = soup.find('main' , {\"property\" : \"mainContentOfPage\"})\n", + " \n", + " try:\n", + " # the case reference number\n", + " _case_ref = content_area.find('h1')\n", + " page_dict['CaseRef'] = \" \".join(_case_ref.text.split())\n", + " \n", + " # the main section\n", + " sections = content_area.section\n", + " \n", + " # the description\n", + " desc = sections.div.p\n", + " page_dict['CaseDesc'] = desc.text.strip()\n", + " \n", + " # the category\n", + " case_type = sections.h2\n", + " page_dict['CaseType'] = \" \".join(case_type.text.split())\n", + " except:\n", + " print('page base info collection error')\n", + " \n", + " page_dict[\"CaseURL\"] = url\n", + " \n", + " \n", + " # find all the images in the persons section\n", + " try:\n", + " # the image link\n", + " images = sections.find_all('img')\n", + " imgs_list = []\n", + " for image in images:\n", + " image_src = image['src']\n", + " # check if this matches the no photo image\n", + " no_photo = re.search(\"noPhoto\\.png\", image_src)\n", + " if not no_photo:\n", + " # find the iamge ID\n", + " img_id = re.search(\"id=(\\d+).*\", image_src)\n", + " imgs_list.append(\"https://www.services.rcmp-grc.gc.ca/missing-disparus/showImage?\"+img_id.group())\n", + " # add the images section \n", + " # add to the main dict\n", + " page_dict['PageImages'] = imgs_list\n", + " except:\n", + " print(\"no images found\")\n", + " \n", + " \"\"\"\n", + " # if we need to treat the page types differently\n", + " if page_dict['CaseType'] == 'Missing':\n", + " \"\"\"\n", + " \n", + " # get the first section with all the persons\n", + " persons_section = sections.section\n", + " \n", + " # how many people are we looking 
through\n", + " persons_names = persons_section.find_all('h3')\n", + " num_persons = len(persons_names)\n", + " # all the blocks within the section\n", + " persons_blocks = persons_section.find_all('div',{\"class\":\"row\"})\n", + " \n", + " # loop through all the person sections to collect their data\n", + " # assigned to their names\n", + " for i in range(num_persons):\n", + " print(\"Person(s) in Case: \"+str(i+1))\n", + " block = {} # stores the individuals info, some pages have 1+\n", + " block['Name'] = \" \".join(persons_names[i].text.split())\n", + " \n", + " # select the current persion\n", + " current_person = persons_blocks[i]\n", + " \n", + " # takes all the DL sections out and saves them\n", + " dl_sections = []\n", + " for dl in current_person.find_all(\"dl\"):\n", + " dl_sections.append(str(dl))\n", + " block[\"InfoSection\"] = dl_sections \n", + " # add the block to the page sections\n", + " page_sections.append(block)\n", + " print(block['Name'])\n", + " #print(persons_blocks[i])\n", + " #print(block)\n", + " \n", + " \"\"\"\n", + " # If this is an unidentified persons record\n", + " else:\n", + " print(\"Un IDs Body\")\n", + " \n", + " \"\"\"\n", + " \n", + " # write the section to the dict\n", + " page_dict['PersonsData'] = page_sections\n", + " # write it all to the main DB\n", + " complete_db[page_dict['CaseRef']] = page_dict\n", + " \n", + "# write JSON to a file \n", + "with open(\"Complete_DB.json\", \"w\") as outfile:\n", + " json.dump(complete_db, outfile)\n", + " \n", + "print('======================= Done =======================')\n" + ] + }, + { + "cell_type": "markdown", + "id": "6123cd3b", + "metadata": {}, + "source": [ + "## Write to a file if it was not done above" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41cf2eed", + "metadata": {}, + "outputs": [], + "source": [ + "# write JSON to a file \n", + "with open(\"Complete_DB.json\", \"w\") as outfile:\n", + " json.dump(complete_db, outfile)" ] } ], "metadata": { + "interpreter": { + "hash": "2516770f1231e9470eebfb2e8e89faf6b4b2c173f0d9550afe423a3c1e5f866c" + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -157,7 +551,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.0" } }, "nbformat": 4, diff --git a/Makefile b/Makefile index bbfa07f2f..e943c1faf 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,12 @@ -setup: ## Install packages via pipenv +setup: pipenv install -open: ## Start the Jupyter notebook +open-viewer: setup ## Open the notebook in an interactive viewer pipenv run jupyter notebook MissingPersons_DataExtract_Tom_May10.ipynb --config jupyter_notebook_config.py +run-code: setup ## Run the notebook's code in terminal (without viewer) + pipenv run jupyter nbconvert --to notebook --execute MissingPersons_DataExtract_Tom_May10.ipynb + %: @true diff --git a/MissingPersons_DataExtract_Tom_May10.ipynb b/MissingPersons_DataExtract_Tom_May10.ipynb index 3f4355851..a57d81494 100644 --- a/MissingPersons_DataExtract_Tom_May10.ipynb +++ b/MissingPersons_DataExtract_Tom_May10.ipynb @@ -46,7 +46,6 @@ "import requests\n", "import json\n", "import csv\n", - "from webdriver_manager.chrome import ChromeDriverManager\n", "from selenium.webdriver.common.by import By" ] }, @@ -57,9 +56,13 @@ "metadata": {}, "outputs": [], "source": [ - "link = 'https://www.services.rcmp-grc.gc.ca/missing-disparus/search-recherche.jsf'\n", - "# Get a chrome driver if there isn't one locally\n", - 
"browser = webdriver.Chrome(ChromeDriverManager().install()) " + "from webdriver_manager.chrome import ChromeDriverManager\n", + "from selenium.webdriver.chrome.service import Service\n", + "from selenium.webdriver.chrome.options import Options\n", + "\n", + "# Allow Chrome to run in headless mode (required for running in GitHub Actions)\n", + "options = Options()\n", + "options.headless = True" ] }, { @@ -71,6 +74,10 @@ }, "outputs": [], "source": [ + "# Open Chrome browser. (get chrome driver if it isn't installed already)\n", + "browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)\n", + "\n", + "link = 'https://www.services.rcmp-grc.gc.ca/missing-disparus/search-recherche.jsf'\n", "browser.get(link)\n", "time.sleep(2)\n", "\n", diff --git a/README.md b/README.md index 96a9eb920..ec8a14532 100644 --- a/README.md +++ b/README.md @@ -37,13 +37,37 @@ Recommendations: - e.g., `brew install pyenv` - then `pyenv install --skip-existing` within this project, to install a known-good version of Python. +To list all available commands, run `make` (without any arguments): + ``` -make setup -make install +$ make +Usage: make + +where is one of the following: + +open-viewer Open the notebook in an interactive viewer +run-code Run the notebook's code in terminal (without viewer) ``` Note: If you don't wish to use `pipenv`, just open up `Makefile` and run its commands directly. +## Scheduled Cloud Run + +This scraper is intended to be run nightly in a cloud environment. We use GitHub Actions for this. + +Configuration: [`.github/workflows/scrape.yml`](/.github/workflows/scrape.yml) +Run History: [GitHub Action runs](https://github.com/CivicTechTO/missing-persons/actions/workflows/scrape.yml) + +Each successful script run in the cloud environment will save a zip file of the +generated files. You can find this on the "Summary" page of any workflow run. + +For example: +https://github.com/CivicTechTO/missing-persons/actions/runs/2382053305 + +While this workflow will run automatically each night, you can force a run +anytime. To do this, push to any branch a commit with `[force ci]` in the most +recent commit message. + ### 1. Web Scraping - Setting up Jupyter Notebooks & Selenium The required packages can be found in requirments.txt, bs4 and selenium are the main non-standard packages needed.