From 54fc6350e760b096e40e4094f0b9faf3673aff8b Mon Sep 17 00:00:00 2001 From: SoulSniper1212 Date: Sun, 9 Nov 2025 18:47:29 -0500 Subject: [PATCH] Update Web Scraper README with detailed documentation Signed-off-by: SoulSniper1212 --- Web Scraper/README.md | 83 ++++++++++- Web Scraper/Web_Scraper.py | 10 +- Web Scraper/example_usage.py | 70 ++++++++++ Web Scraper/google_web_scraper.py | 221 ++++++++++++++++++++++++++++++ Web Scraper/requirements.txt | 1 + 5 files changed, 378 insertions(+), 7 deletions(-) create mode 100644 Web Scraper/example_usage.py create mode 100644 Web Scraper/google_web_scraper.py create mode 100644 Web Scraper/requirements.txt diff --git a/Web Scraper/README.md b/Web Scraper/README.md index 5c796460..ce3500ed 100644 --- a/Web Scraper/README.md +++ b/Web Scraper/README.md @@ -1,8 +1,87 @@ -In this script, we use the `requests` library to send a GET request to the Python.org blogs page. We then use the `BeautifulSoup` library to parse the HTML content of the page. +# Web Scraper -We find all the blog titles on the page by searching for `h2` elements with the class `blog-title`. We then print each title found and save them to a file named `blog_titles.txt`. +This repository contains two web scraping scripts: +## 1. Traditional Web Scraper (`Web_Scraper.py`) + +This script uses the `requests` library to send a GET request to the Python.org blogs page. It then uses the `BeautifulSoup` library to parse the HTML content of the page. + +It finds all the blog titles on the page by searching for `h2` elements with the class `blog-title`. It then prints each title found and saves them to a file named `blog_titles.txt`. + +### Usage To run this script, first install the required libraries: ```bash pip install requests beautifulsoup4 +``` + +Then run: + +```bash +python Web_Scraper.py +``` + +## 2. Google Custom Search Scraper (`google_web_scraper.py`) + +This enhanced CLI web scraper uses the Google Custom Search API to extract URLs, titles, and snippets from search results. This approach is more robust than traditional web scraping methods as it: + +- Bypasses CAPTCHA challenges that may occur during direct web scraping +- Retrieves structured data (title, URL, and snippet/description) +- Handles dynamic websites more reliably +- Is less prone to breaking when website structures change +- Allows searching by keyword to retrieve multiple metadata fields + +### Prerequisites +Before using this script, you need: +1. A Google API Key from [Google Cloud Console](https://console.cloud.google.com/apis/credentials) +2. A Custom Search Engine ID from [Google Programmable Search Engine](https://programmablesearchengine.google.com/) + +### Installation +```bash +pip install -r requirements.txt +``` + +### Setup +Set your API credentials as environment variables: +```bash +export GOOGLE_API_KEY='your_google_api_key' +export SEARCH_ENGINE_ID='your_search_engine_id' +``` + +Alternatively, you can pass them directly as command-line arguments. 
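+
+The scraper's constructor falls back to the `GOOGLE_API_KEY` and `SEARCH_ENGINE_ID` environment variables, so it can also be driven programmatically. A minimal sketch, along the lines of `example_usage.py` (it assumes the variables above are already exported):
+
+```python
+from google_web_scraper import GoogleSearchScraper
+
+# Credentials are picked up from GOOGLE_API_KEY / SEARCH_ENGINE_ID here;
+# they can also be passed explicitly as api_key= and search_engine_id=.
+scraper = GoogleSearchScraper()
+
+# Fetch up to 5 results and print the fields extracted from each item.
+for item in scraper.search("Python tutorials", num_results=5):
+    print(item["title"], "->", item["link"])
+```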
+ +### Usage +Basic usage: +```bash +python google_web_scraper.py --query "Python tutorials" --results 10 +``` + +Save results in JSON format: +```bash +python google_web_scraper.py --query "machine learning blogs" --results 20 --format json +``` + +Specify output file: +```bash +python google_web_scraper.py --query "web development news" --output my_search.json --format json +``` + +With API credentials as arguments: +```bash +python google_web_scraper.py --query "Python tutorials" --api-key YOUR_API_KEY --engine-id YOUR_ENGINE_ID +``` + +### Options +- `--query, -q`: Search query to use for web scraping (required) +- `--results, -r`: Number of search results to retrieve (default: 10) +- `--output, -o`: Output file name (default: search_results.txt) +- `--format, -f`: Output format: txt or json (default: txt) +- `--api-key, -k`: Google API Key (optional) +- `--engine-id, -e`: Google Custom Search Engine ID (optional) + +### Features +- Command-line interface with configurable options +- Support for both TXT and JSON output formats +- Environment variable support for credentials +- Error handling and user-friendly messages +- Ability to retrieve multiple pages of results \ No newline at end of file diff --git a/Web Scraper/Web_Scraper.py b/Web Scraper/Web_Scraper.py index f1f0d62b..e8fe64ef 100644 --- a/Web Scraper/Web_Scraper.py +++ b/Web Scraper/Web_Scraper.py @@ -1,6 +1,10 @@ import requests from bs4 import BeautifulSoup +print("This is the traditional web scraper using BeautifulSoup.") +print("For a more robust solution using Google Custom Search API, see 'google_web_scraper.py'") +print() + # URL to scrape data from URL = "https://www.python.org/blogs/" @@ -23,8 +27,4 @@ for title in titles: file.write(title.get_text(strip=True) + "\n") -print("\nBlog titles saved to 'blog_titles.txt'.") - - - - \ No newline at end of file +print("\nBlog titles saved to 'blog_titles.txt'.") \ No newline at end of file diff --git a/Web Scraper/example_usage.py b/Web Scraper/example_usage.py new file mode 100644 index 00000000..a8b58007 --- /dev/null +++ b/Web Scraper/example_usage.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +""" +Example usage of the Google Custom Search Scraper +This demonstrates how to use the scraper programmatically +""" +import os +from google_web_scraper import GoogleSearchScraper + + +def example_basic_usage(): + """Example of basic usage""" + # Initialize the scraper with API credentials + # These can be set as environment variables: GOOGLE_API_KEY and SEARCH_ENGINE_ID + api_key = os.getenv('GOOGLE_API_KEY') + search_engine_id = os.getenv('SEARCH_ENGINE_ID') + + if not api_key or not search_engine_id: + print("Please set GOOGLE_API_KEY and SEARCH_ENGINE_ID environment variables") + return + + try: + scraper = GoogleSearchScraper(api_key=api_key, search_engine_id=search_engine_id) + + # Search for Python tutorials + results = scraper.search("Python tutorials", num_results=5) + + print(f"Found {len(results)} results:") + for i, result in enumerate(results, 1): + title = result.get('title', 'No title') + link = result.get('link', 'No URL') + snippet = result.get('snippet', 'No snippet') + print(f"{i}. 
{title}") + print(f" URL: {link}") + print(f" Snippet: {snippet}") + print() + except Exception as e: + print(f"Error during search: {e}") + + +def example_multiple_pages(): + """Example of searching multiple pages""" + api_key = os.getenv('GOOGLE_API_KEY') + search_engine_id = os.getenv('SEARCH_ENGINE_ID') + + if not api_key or not search_engine_id: + print("Please set GOOGLE_API_KEY and SEARCH_ENGINE_ID environment variables") + return + + try: + scraper = GoogleSearchScraper(api_key=api_key, search_engine_id=search_engine_id) + + # Search for multiple pages of results + results = scraper.search_multiple_pages("machine learning", total_results=15) + + print(f"Found {len(results)} results for 'machine learning':") + for i, result in enumerate(results, 1): + title = result.get('title', 'No title') + link = result.get('link', 'No URL') + print(f"{i:2d}. {title}") + print(f" URL: {link}") + print() + except Exception as e: + print(f"Error during search: {e}") + + +if __name__ == "__main__": + print("=== Basic Usage Example ===") + example_basic_usage() + print("\n=== Multiple Pages Example ===") + example_multiple_pages() \ No newline at end of file diff --git a/Web Scraper/google_web_scraper.py b/Web Scraper/google_web_scraper.py new file mode 100644 index 00000000..e3f679ac --- /dev/null +++ b/Web Scraper/google_web_scraper.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 +""" +CLI Web Scraper using Google Custom Search API +This script allows users to search for websites and extract titles, URLs, and snippets +using Google Custom Search API, making it more robust than traditional web scraping. +""" +import os +import sys +import json +import argparse +import requests + + +class GoogleSearchScraper: + """ + A web scraper that uses Google Custom Search API to extract URLs, titles, and snippets. + """ + + def __init__(self, api_key=None, search_engine_id=None): + """ + Initialize the scraper with API credentials. + + :param api_key: Google API Key + :param search_engine_id: Google Custom Search Engine ID + """ + self.api_key = api_key or os.getenv('GOOGLE_API_KEY') + self.search_engine_id = search_engine_id or os.getenv('SEARCH_ENGINE_ID') + + if not self.api_key: + raise ValueError( + "Google API Key is required. " + "Set GOOGLE_API_KEY environment variable or pass as parameter." + ) + if not self.search_engine_id: + raise ValueError( + "Search Engine ID is required. " + "Set SEARCH_ENGINE_ID environment variable or pass as parameter." + ) + + self.base_url = "https://www.googleapis.com/customsearch/v1" + + def search(self, query, num_results=10, start_index=1): + """ + Search using Google Custom Search API and extract results. 
+ + :param query: Search query + :param num_results: Number of results to return (max 10 per request) + :param start_index: Starting index for results (1-based) + :return: List of dictionaries containing title, URL, and snippet + """ + # The API allows maximum 10 results per request + if num_results > 10: + num_results = 10 + + params = { + 'key': self.api_key, + 'cx': self.search_engine_id, + 'q': query, + 'num': num_results, + 'start': start_index + } + + response = requests.get(self.base_url, params=params, timeout=10) + + if response.status_code != 200: + raise Exception(f"Error from Google API: {response.status_code} - {response.text}") + + data = response.json() + + results = [] + if 'items' in data: + for item in data['items']: + result = { + 'title': item.get('title', ''), + 'link': item.get('link', ''), + 'snippet': item.get('snippet', '') + } + results.append(result) + + return results + + def search_multiple_pages(self, query, total_results=10): + """ + Search and retrieve multiple pages of results. + + :param query: Search query + :param total_results: Total number of results desired + :return: List of all results + """ + all_results = [] + start_index = 1 + + while len(all_results) < total_results: + num_to_fetch = min(10, total_results - len(all_results)) + results = self.search(query, num_to_fetch, start_index) + + if not results: + break + + all_results.extend(results) + start_index += 10 + + # Break if we have reached the desired number of results + if len(all_results) >= total_results: + break + + return all_results[:total_results] + + +def save_results_to_file(results, filename, format_type='txt'): + """ + Save search results to a file in the specified format. + + :param results: List of search results + :param filename: Name of the file to save to + :param format_type: Format to save ('txt' or 'json') + """ + if format_type.lower() == 'json': + with open(filename, 'w', encoding='utf-8') as f: + json.dump(results, f, indent=2, ensure_ascii=False) + else: # default to txt format + with open(filename, 'w', encoding='utf-8') as f: + for i, result in enumerate(results, 1): + f.write(f"Result {i}:\n") + f.write(f"Title: {result['title']}\n") + f.write(f"URL: {result['link']}\n") + f.write(f"Snippet: {result['snippet']}\n") + f.write("-" * 50 + "\n") + + +def main(): + parser = argparse.ArgumentParser( + description='CLI Web Scraper using Google Custom Search API', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s --query "Python tutorials" --results 10 + %(prog)s --query "machine learning blogs" --results 20 --format json + %(prog)s --query "web development news" --output my_search.txt + """ + ) + + parser.add_argument( + '--query', + '-q', + required=True, + help='Search query to use for web scraping' + ) + + parser.add_argument( + '--results', + '-r', + type=int, + default=10, + help='Number of search results to retrieve (default: 10)' + ) + + parser.add_argument( + '--output', + '-o', + default='search_results.txt', + help='Output file name (default: search_results.txt)' + ) + + parser.add_argument( + '--format', + '-f', + choices=['txt', 'json'], + default='txt', + help='Output format: txt or json (default: txt)' + ) + + parser.add_argument( + '--api-key', + '-k', + help='Google API Key (optional, will use GOOGLE_API_KEY env var if not provided)' + ) + + parser.add_argument( + '--engine-id', + '-e', + help='Google Custom Search Engine ID (optional, will use SEARCH_ENGINE_ID env var if not provided)' + ) + + args = 
parser.parse_args() + + try: + scraper = GoogleSearchScraper(api_key=args.api_key, search_engine_id=args.engine_id) + results = scraper.search_multiple_pages(args.query, args.results) + + if not results: + print("No results found for the query.") + return + + print(f"Found {len(results)} results for query '{args.query}':\n") + + for i, result in enumerate(results, 1): + print(f"{i}. {result['title']}") + print(f" URL: {result['link']}") + print(f" Snippet: {result['snippet']}") + print() + + save_results_to_file(results, args.output, args.format) + print(f"Results saved to '{args.output}' in {args.format.upper()} format.") + + except ValueError as e: + print(f"Configuration Error: {e}", file=sys.stderr) + print("\nTo use this script, you need to provide Google API credentials:") + print("1. Get a Google API key from: https://console.cloud.google.com/apis/credentials") + print("2. Create a Custom Search Engine from: https://programmablesearchengine.google.com/") + print("3. Set environment variables or use command-line parameters:") + print(" export GOOGLE_API_KEY='your_api_key'") + print(" export SEARCH_ENGINE_ID='your_search_engine_id'") + sys.exit(1) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/Web Scraper/requirements.txt b/Web Scraper/requirements.txt new file mode 100644 index 00000000..57eb557f --- /dev/null +++ b/Web Scraper/requirements.txt @@ -0,0 +1 @@ +requests>=2.32.5,<3.0 \ No newline at end of file
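
A note on consuming the JSON output: `save_results_to_file` writes a plain JSON list of result objects, so downstream scripts can load it directly. A minimal sketch, assuming results were saved with `--output my_search.json --format json` as in the README example:

```python
import json

# Load results previously written by google_web_scraper.py in JSON format.
with open("my_search.json", encoding="utf-8") as f:
    results = json.load(f)

# Each entry carries the title, link, and snippet extracted from the API response.
for item in results:
    print(f"{item['title']} -> {item['link']}")
```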