diff --git a/.github/workflows/scrape.yml b/.github/workflows/scrape.yml
new file mode 100644
index 000000000..c07e5852e
--- /dev/null
+++ b/.github/workflows/scrape.yml
@@ -0,0 +1,28 @@
+on:
+ schedule:
+ # Run every day at 7am UTC (3am Eastern during daylight saving time)
+ # See: https://crontab.guru/#0_7_*_*_*
+ - cron: "0 7 * * *"
+ push:
+
+
+jobs:
+ scrape:
+ runs-on: ubuntu-latest
+ # Only run when scheduled, or if push has "[force ci]" in its last commit message.
+ if: "github.event_name == 'schedule' || contains(github.event.head_commit.message, '[force ci]')"
+ steps:
+ - uses: actions/checkout@v3
+
+ - uses: actions/setup-python@v3
+ with:
+ python-version: '3.10'
+
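+ # Install pipenv; the Makefile targets use it to install dependencies and run the notebook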
+ - run: pip install --user pipenv
+
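+ # Execute the notebook headlessly via nbconvert (the run-code target in the Makefile)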
+ - run: make run-code
+
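+ # Upload the scraped URL list (list.csv, written by the notebook) as a build artifact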
+ - uses: actions/upload-artifact@v3
+ with:
+ name: url-list
+ path: list.csv
diff --git a/.ipynb_checkpoints/MissingPersons_DataExtract_Tom_May10-checkpoint.ipynb b/.ipynb_checkpoints/MissingPersons_DataExtract_Tom_May10-checkpoint.ipynb
index d11dd8651..eea65226f 100644
--- a/.ipynb_checkpoints/MissingPersons_DataExtract_Tom_May10-checkpoint.ipynb
+++ b/.ipynb_checkpoints/MissingPersons_DataExtract_Tom_May10-checkpoint.ipynb
@@ -32,7 +32,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"id": "cf682cb3",
"metadata": {},
"outputs": [],
@@ -43,7 +43,25 @@
"from selenium import webdriver\n",
"from selenium.webdriver.support.ui import Select\n",
"from bs4 import BeautifulSoup as bs\n",
- "import requests"
+ "import requests\n",
+ "import json\n",
+ "import csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "554bc853",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from webdriver_manager.chrome import ChromeDriverManager\n",
+ "from selenium.webdriver.chrome.service import Service\n",
+ "from selenium.webdriver.chrome.options import Options\n",
+ "\n",
+ "# Allow Chrome to run in headless mode (required for running in GitHub Actions)\n",
+ "options = Options()\n",
+ "options.headless = True"
]
},
{
@@ -55,11 +73,10 @@
},
"outputs": [],
"source": [
- "link = 'https://www.services.rcmp-grc.gc.ca/missing-disparus/search-recherche.jsf'\n",
- "driver=r'C:\\Users\\Neptune\\Downloads\\chromedriver'\n",
+ "# Open Chrome browser. (get chrome driver if it isn't installed already)\n",
+ "browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)\n",
"\n",
- "# using the selenium web driver I downloaded\n",
- "browser = webdriver.Chrome(driver)\n",
+ "link = 'https://www.services.rcmp-grc.gc.ca/missing-disparus/search-recherche.jsf'\n",
"browser.get(link)\n",
"time.sleep(2)\n",
"\n",
@@ -75,7 +92,7 @@
"except:\n",
" print('could not find Search')\n",
"\n",
- "time.sleep(5)\n",
+ "time.sleep(3)\n",
"\n",
"not_last = True\n",
"# to store all the URLs\n",
@@ -107,7 +124,7 @@
" next_page = browser.find_element_by_xpath('/html/body/main/form/div[33]/ul/li[83]/a')\n",
" print('Found next button to press.')\n",
" next_page.click()\n",
- " time.sleep(4) # wait for next page to load\n",
+ " time.sleep(2) # wait for next page to load\n",
" except:\n",
" # should not have a next button on the last page\n",
" print('last page or no next button found!')\n",
@@ -116,7 +133,12 @@
"# the final list\n",
"print(\"================================== END ==================================\")\n",
"print(len(URLs[:30])) # show a section \n",
- "time.sleep(5)\n",
+ "time.sleep(2)\n",
+ "\n",
+ "# write progress to csv\n",
+ "df = pd.DataFrame(URLs, columns=[\"URL\"])\n",
+ "df.to_csv('list.csv', index=False)\n",
+ "\n",
"browser.quit()"
]
},
@@ -128,6 +150,14 @@
"#### Collect all the data from all the detailed case pages"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "77947f1a",
+ "metadata": {},
+ "source": [
+ "This part of the code adapted from the CBC script"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
@@ -135,13 +165,377 @@
"metadata": {},
"outputs": [],
"source": [
- "# loop through all the URLs and collect the data from each page\n",
- "for url in URLs:\n",
- " persons_page_url = 'https://www.services.rcmp-grc.gc.ca' + url"
+ "base_url = r'https://www.services.rcmp-grc.gc.ca'\n",
+ "\n",
+ "#CLEANING FUNCTION\n",
+ "def cleaning_function(item):\n",
+ " item = str(item)\n",
+ " item = item.replace(\"
\" , \"\")\n",
+ " item = item.replace(\"\" , \"\")\n",
+ " item = item.replace(\"\" , \"\")\n",
+ " item = item.replace(\"
\" , \"\")\n",
+ " item = item.replace(\"Missing from \" , \"\")\n",
+ " item = item.replace(\"\" , \"\")\n",
+ " item = item.replace(\"\" , \"\")\n",
+ " return item"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d72d3b0e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#this is the list where all the URLs from the sheet will go\n",
+ "person_url_list = []\n",
+ "#this is where all the person info will go\n",
+ "person_info = []\n",
+ "\n",
+ "#a list for the sections later\n",
+ "section_list = []\n",
+ "\n",
+ "#I have this because I dont know how else to filter out stuff from an if statement that I dont want\n",
+ "count_working = 0\n",
+ "\n",
+ "for page_url in URLs:\n",
+ " print(\"Record Number: \" + str(count_working))\n",
+ " print(\"Case URL: \" + page_url)\n",
+ " count_working += 1\n",
+ " url = base_url + page_url\n",
+ " \n",
+ " # request the html\n",
+ " try:\n",
+ " page = requests.get(url, timeout = 10)\n",
+ " except requests.exceptions.Timeout:\n",
+ " print(\"Timeout occurred\")\n",
+ " # structure the page content for parsing\n",
+ " soup = bs(page.content, 'html.parser') \n",
+ " \n",
+ " # First we have to pull out the content area\n",
+ " content_area = soup.find('main' , {\"property\" : \"mainContentOfPage\"})\n",
+ " \n",
+ " # LOCATION\n",
+ " try:\n",
+ " location_scrape = content_area.find('div')\n",
+ " location_isolate = location_scrape.find_all('p')\n",
+ " location_string = str(location_isolate[2])\n",
+ " location_split = location_string.split(\",\")\n",
+ " province = cleaning_function(location_split[1])\n",
+ " city = cleaning_function(location_split[0])\n",
+ " except:\n",
+ " print('No Location')\n",
+ " \n",
+ " # STATUS\n",
+ " status_scrape = content_area.find_all('h2')\n",
+ " status = status_scrape[:1]\n",
+ " status = str(status)\n",
+ " front_of_status = status.index('') + 4\n",
+ " back_of_status = status.index('
')\n",
+ " status_cleaned = (f'{status[front_of_status : back_of_status]}')\n",
+ " \n",
+ " # FOR THE MISSING ENTRIES\n",
+ " if 'Missing' in status_cleaned:\n",
+ " #Now we get into pulling out individual details which will eventually be compiled in a list\n",
+ " #NAME(MISSING)\n",
+ " name_scrape = content_area.find_all('h3')\n",
+ " person_name = name_scrape[:1]\n",
+ " person_name = str(person_name)\n",
+ " front_of_name = person_name.index('') + 4\n",
+ " back_of_name = person_name.index('
')\n",
+ " name_cleaned = (f'{person_name[front_of_name : back_of_name]}')\n",
+ " name_split = name_cleaned.split(',')\n",
+ " last_name = name_split[0]\n",
+ " first_name = name_split[1]\n",
+ " first_name_string = str(name_split[1:2])\n",
+ " first_name_string = first_name_string.replace('[',\"\")\n",
+ " first_name_string = first_name_string.replace(']',\"\")\n",
+ " first_name_string = first_name_string.replace(\"'\",\"\")\n",
+ " first_name_string = first_name_string.replace(\"\\n\",\"\")\n",
+ " first_name_string = first_name_string.strip()\n",
+ " \n",
+ " #PERSON DETAILS(MISSING)\n",
+ " try:\n",
+ " person_details = content_area.find_all('dd')\n",
+ " date_missing_discovered = person_details[0]\n",
+ " year_born = person_details[1]\n",
+ " age_at_disappearance = person_details[2]\n",
+ " gender = person_details[3]\n",
+ " bio_group = person_details[4]\n",
+ " except:\n",
+ " print('Data error')\n",
+ "\n",
+ " #FOR THE UNIDENTIFIED ENTRIES\n",
+ " else:\n",
+ " try:\n",
+ " first_name_string = 'Unidentified'\n",
+ " last_name = 'Unidentified'\n",
+ " person_details = content_area.find_all('dd')\n",
+ " date_missing_discovered = person_details[0]\n",
+ " age_at_disappearance = person_details[1]\n",
+ " gender = person_details[2]\n",
+ " bio_group = person_details[3]\n",
+ " year_born = 'Unknown'\n",
+ " except:\n",
+ " print('Data error2')\n",
+ " \n",
+ " \n",
+ " #PUT IT ALL TOGETHER\n",
+ " person_info.append([first_name_string , last_name , status_cleaned , cleaning_function(date_missing_discovered) , cleaning_function(year_born) , cleaning_function(age_at_disappearance) , cleaning_function(gender) , cleaning_function(bio_group) , city , province , url])\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "30062b69",
+ "metadata": {},
+ "source": [
+ "### Save the file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2b297ad3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# print to a file\n",
+ "full_file = pd.DataFrame(person_info)\n",
+ "full_file.to_csv(\"output_rcmp.csv\")\n",
+ "print('Done')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "72e7fbf7",
+ "metadata": {},
+ "source": [
+ "### To Avoid Running the URL Collector Again - Run Code Below"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8e5e73d5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open('list.csv') as f:\n",
+ " allLines = f.readlines()\n",
+ " TempURLs = list(allLines)\n",
+ " # remove the column header\n",
+ " TempURLs = TempURLs[1:]\n",
+ " f.close()\n",
+ "\n",
+ "# clean the elements \n",
+ "URLs = []\n",
+ "for link in TempURLs:\n",
+ " URLs.append(link.strip())\n",
+ " \n",
+ "print(URLs[:10])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "32bc0a32",
+ "metadata": {},
+ "source": [
+ "#### Function to Turn DL sections into dict - No Longer Used!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "40cad4bf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def create_dl_dict(soup):\n",
+ " keys, values = [] , []\n",
+ " for dl in soup.find_all(\"dl\", {\"class\":\"dl-horizontal\"}):\n",
+ " for dt in dl.find_all(\"dt\"):\n",
+ " keys.append(dt.text.strip())\n",
+ " for dd in dl.find_all(\"dd\"):\n",
+ " values.append(dd.text.strip())\n",
+ " \n",
+ " return dict(zip(keys,values))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5513dd44",
+ "metadata": {},
+ "source": [
+ "### Second Method - For More Structured Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f01a313d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#I have this because I dont know how else to filter out stuff from an if statement that I dont want\n",
+ "count_working = 0\n",
+ "\n",
+ "# complete file\n",
+ "complete_db = {}\n",
+ "\n",
+ "# used to test\n",
+ "test_URLs = URLs[:2]\n",
+ "\n",
+ "# loop through all the URLs\n",
+ "for page_url in URLs:\n",
+ " \n",
+ " # page dict\n",
+ " page_dict = {}\n",
+ " #this is where all the person info will go\n",
+ " page_sections = []\n",
+ " # make the full URL\n",
+ " url = base_url + page_url\n",
+ " \n",
+ " print('==============================================')\n",
+ " print(\"Record Number: \" + str(count_working))\n",
+ " print(\"Case URL: \" + url)\n",
+ " \n",
+ " # next record\n",
+ " count_working += 1\n",
+ " \n",
+ " # request the html\n",
+ " try:\n",
+ " page = requests.get(url, timeout = 10)\n",
+ " except requests.exceptions.Timeout:\n",
+ " print(\"Timeout occurred\")\n",
+ " \n",
+ " # structure the page content for parsing\n",
+ " soup = bs(page.content, 'html.parser') \n",
+ " \n",
+ " #print(soup)\n",
+ " \n",
+ " # First we have to pull out the content area\n",
+ " content_area = soup.find('main' , {\"property\" : \"mainContentOfPage\"})\n",
+ " \n",
+ " try:\n",
+ " # the case reference number\n",
+ " _case_ref = content_area.find('h1')\n",
+ " page_dict['CaseRef'] = \" \".join(_case_ref.text.split())\n",
+ " \n",
+ " # the main section\n",
+ " sections = content_area.section\n",
+ " \n",
+ " # the description\n",
+ " desc = sections.div.p\n",
+ " page_dict['CaseDesc'] = desc.text.strip()\n",
+ " \n",
+ " # the category\n",
+ " case_type = sections.h2\n",
+ " page_dict['CaseType'] = \" \".join(case_type.text.split())\n",
+ " except:\n",
+ " print('page base info collection error')\n",
+ " \n",
+ " page_dict[\"CaseURL\"] = url\n",
+ " \n",
+ " \n",
+ " # find all the images in the persons section\n",
+ " try:\n",
+ " # the image link\n",
+ " images = sections.find_all('img')\n",
+ " imgs_list = []\n",
+ " for image in images:\n",
+ " image_src = image['src']\n",
+ " # check if this matches the no photo image\n",
+ " no_photo = re.search(\"noPhoto\\.png\", image_src)\n",
+ " if not no_photo:\n",
+ " # find the iamge ID\n",
+ " img_id = re.search(\"id=(\\d+).*\", image_src)\n",
+ " imgs_list.append(\"https://www.services.rcmp-grc.gc.ca/missing-disparus/showImage?\"+img_id.group())\n",
+ " # add the images section \n",
+ " # add to the main dict\n",
+ " page_dict['PageImages'] = imgs_list\n",
+ " except:\n",
+ " print(\"no images found\")\n",
+ " \n",
+ " \"\"\"\n",
+ " # if we need to treat the page types differently\n",
+ " if page_dict['CaseType'] == 'Missing':\n",
+ " \"\"\"\n",
+ " \n",
+ " # get the first section with all the persons\n",
+ " persons_section = sections.section\n",
+ " \n",
+ " # how many people are we looking through\n",
+ " persons_names = persons_section.find_all('h3')\n",
+ " num_persons = len(persons_names)\n",
+ " # all the blocks within the section\n",
+ " persons_blocks = persons_section.find_all('div',{\"class\":\"row\"})\n",
+ " \n",
+ " # loop through all the person sections to collect their data\n",
+ " # assigned to their names\n",
+ " for i in range(num_persons):\n",
+ " print(\"Person(s) in Case: \"+str(i+1))\n",
+ " block = {} # stores the individuals info, some pages have 1+\n",
+ " block['Name'] = \" \".join(persons_names[i].text.split())\n",
+ " \n",
+ " # select the current persion\n",
+ " current_person = persons_blocks[i]\n",
+ " \n",
+ " # takes all the DL sections out and saves them\n",
+ " dl_sections = []\n",
+ " for dl in current_person.find_all(\"dl\"):\n",
+ " dl_sections.append(str(dl))\n",
+ " block[\"InfoSection\"] = dl_sections \n",
+ " # add the block to the page sections\n",
+ " page_sections.append(block)\n",
+ " print(block['Name'])\n",
+ " #print(persons_blocks[i])\n",
+ " #print(block)\n",
+ " \n",
+ " \"\"\"\n",
+ " # If this is an unidentified persons record\n",
+ " else:\n",
+ " print(\"Un IDs Body\")\n",
+ " \n",
+ " \"\"\"\n",
+ " \n",
+ " # write the section to the dict\n",
+ " page_dict['PersonsData'] = page_sections\n",
+ " # write it all to the main DB\n",
+ " complete_db[page_dict['CaseRef']] = page_dict\n",
+ " \n",
+ "# write JSON to a file \n",
+ "with open(\"Complete_DB.json\", \"w\") as outfile:\n",
+ " json.dump(complete_db, outfile)\n",
+ " \n",
+ "print('======================= Done =======================')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6123cd3b",
+ "metadata": {},
+ "source": [
+ "## Write to a file if it was not done above"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "41cf2eed",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# write JSON to a file \n",
+ "with open(\"Complete_DB.json\", \"w\") as outfile:\n",
+ " json.dump(complete_db, outfile)"
]
}
],
"metadata": {
+ "interpreter": {
+ "hash": "2516770f1231e9470eebfb2e8e89faf6b4b2c173f0d9550afe423a3c1e5f866c"
+ },
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
@@ -157,7 +551,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.10"
+ "version": "3.10.0"
}
},
"nbformat": 4,
diff --git a/Makefile b/Makefile
index bbfa07f2f..e943c1faf 100644
--- a/Makefile
+++ b/Makefile
@@ -1,9 +1,12 @@
-setup: ## Install packages via pipenv
+setup:
pipenv install
-open: ## Start the Jupyter notebook
+open-viewer: setup ## Open the notebook in an interactive viewer
pipenv run jupyter notebook MissingPersons_DataExtract_Tom_May10.ipynb --config jupyter_notebook_config.py
+run-code: setup ## Run the notebook's code in terminal (without viewer)
+ pipenv run jupyter nbconvert --to notebook --execute MissingPersons_DataExtract_Tom_May10.ipynb
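+# Note: by default nbconvert writes the executed copy to MissingPersons_DataExtract_Tom_May10.nbconvert.ipynb;
+# the scraped list.csv (uploaded by the GitHub Actions workflow) is written by the notebook code itself.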
+
%:
@true
diff --git a/MissingPersons_DataExtract_Tom_May10.ipynb b/MissingPersons_DataExtract_Tom_May10.ipynb
index 3f4355851..a57d81494 100644
--- a/MissingPersons_DataExtract_Tom_May10.ipynb
+++ b/MissingPersons_DataExtract_Tom_May10.ipynb
@@ -46,7 +46,6 @@
"import requests\n",
"import json\n",
"import csv\n",
- "from webdriver_manager.chrome import ChromeDriverManager\n",
"from selenium.webdriver.common.by import By"
]
},
@@ -57,9 +56,13 @@
"metadata": {},
"outputs": [],
"source": [
- "link = 'https://www.services.rcmp-grc.gc.ca/missing-disparus/search-recherche.jsf'\n",
- "# Get a chrome driver if there isn't one locally\n",
- "browser = webdriver.Chrome(ChromeDriverManager().install()) "
+ "from webdriver_manager.chrome import ChromeDriverManager\n",
+ "from selenium.webdriver.chrome.service import Service\n",
+ "from selenium.webdriver.chrome.options import Options\n",
+ "\n",
+ "# Allow Chrome to run in headless mode (required for running in GitHub Actions)\n",
+ "options = Options()\n",
+ "options.headless = True"
]
},
{
@@ -71,6 +74,10 @@
},
"outputs": [],
"source": [
+ "# Open Chrome browser. (get chrome driver if it isn't installed already)\n",
+ "browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)\n",
+ "\n",
+ "link = 'https://www.services.rcmp-grc.gc.ca/missing-disparus/search-recherche.jsf'\n",
"browser.get(link)\n",
"time.sleep(2)\n",
"\n",
diff --git a/README.md b/README.md
index 96a9eb920..ec8a14532 100644
--- a/README.md
+++ b/README.md
@@ -37,13 +37,37 @@ Recommendations:
- e.g., `brew install pyenv`
- then `pyenv install --skip-existing` within this project, to install a known-good version of Python.
+To list all available commands, run `make` (without any arguments):
+
```
-make setup
-make install
+$ make
+Usage: make <command>
+
+where <command> is one of the following:
+
+open-viewer Open the notebook in an interactive viewer
+run-code Run the notebook's code in terminal (without viewer)
```
Note: If you don't wish to use `pipenv`, just open up `Makefile` and run its commands directly.
+## Scheduled Cloud Run
+
+This scraper is intended to be run nightly in a cloud environment. We use GitHub Actions for this.
+
+- Configuration: [`.github/workflows/scrape.yml`](/.github/workflows/scrape.yml)
+- Run history: [GitHub Action runs](https://github.com/CivicTechTO/missing-persons/actions/workflows/scrape.yml)
+
+Each successful run in the cloud environment uploads a zip archive of the
+generated files as a build artifact. You can find it on the "Summary" page of
+any workflow run.
+
+For example:
+https://github.com/CivicTechTO/missing-persons/actions/runs/2382053305
+
+While this workflow runs automatically each night, you can also force a run at
+any time by pushing a commit whose message contains `[force ci]` to any branch.
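+
+For example, to trigger a run without changing any files, push an empty commit:
+
+```
+git commit --allow-empty -m "Trigger scrape [force ci]"
+git push
+```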
+
### 1. Web Scraping - Setting up Jupyter Notebooks & Selenium
The required packages can be found in requirements.txt; bs4 and selenium are the main non-standard packages needed.