diff --git a/main.py b/main.py index bf9207c9..cdc32b66 100755 --- a/main.py +++ b/main.py @@ -12,6 +12,7 @@ from torbot.modules.updater import check_version from torbot.modules.info import execute_all, fetch_html from torbot.modules.linktree import LinkTree +from torbot.modules.deep_extract import DeepExtractor def print_tor_ip_address(client: httpx.Client) -> None: @@ -52,6 +53,63 @@ def print_header(version: str) -> None: print(title) +def handle_deep_extraction(tree: LinkTree, client: httpx.Client, export_path: str = None) -> None: + """ + Handle deep content extraction from crawled pages. + + Args: + tree: LinkTree object with crawled URLs + client: HTTP client for making requests + export_path: Optional path to export intelligence data + """ + logging.info("Starting deep content extraction...") + deep_extractor = DeepExtractor() + + # Extract content from each page in the tree + pages_analyzed = 0 + for node_url in tree.nodes: + try: + logging.debug(f"Extracting from: {node_url}") + response = client.get(node_url) + if response.status_code == 200: + deep_extractor.extract_all(response.text, node_url) + pages_analyzed += 1 + except Exception as e: + logging.warning(f"Could not extract from {node_url}: {str(e)}") + + logging.info(f"Deep extraction complete. Analyzed {pages_analyzed} pages.") + + # Print summary + deep_extractor.print_summary() + + # Export to JSON if requested + if export_path: + logging.info(f"Exporting intelligence to {export_path}...") + deep_extractor.export_to_json(export_path) + + # Also create a text report + base_path = export_path.rsplit('.', 1)[0] if '.' in export_path else export_path + text_report_path = f"{base_path}_report.txt" + deep_extractor.export_to_text(text_report_path) + logging.info(f"Text report saved to {text_report_path}") + + +def handle_visualization(tree: LinkTree, visualize_mode: str = None) -> None: + """ + Handle visualization of crawled data. + + Args: + tree: LinkTree object with crawled data + visualize_mode: Visualization mode (table, tree, json) + """ + if visualize_mode == "table" or not visualize_mode: + tree.showTable() + elif visualize_mode == "tree": + print(tree) + elif visualize_mode == "json": + tree.showJSON() + + def run(arg_parser: argparse.ArgumentParser, version: str) -> None: args = arg_parser.parse_args() @@ -66,7 +124,7 @@ def run(arg_parser: argparse.ArgumentParser, version: str) -> None: arg_parser.print_help() sys.exit() - # Print verison then exit + # Print version then exit if args.version: print(f"TorBot Version: {version}") sys.exit() @@ -93,6 +151,10 @@ def run(arg_parser: argparse.ArgumentParser, version: str) -> None: tree = LinkTree(url=args.url, depth=args.depth, client=client) tree.load() + # Deep extraction if requested + if args.deep_extract: + handle_deep_extraction(tree, client, args.export_intel) + # save data if desired if args.save == "tree": tree.save() @@ -105,12 +167,7 @@ def run(arg_parser: argparse.ArgumentParser, version: str) -> None: fetch_html(client, args.url, tree, save_html=True) # always print something, table is the default - if args.visualize == "table" or not args.visualize: - tree.showTable() - elif args.visualize == "tree": - print(tree) - elif args.visualize == "json": - tree.showJSON() + handle_visualization(tree, args.visualize) print("\n\n") @@ -123,10 +180,10 @@ def set_arguments() -> argparse.ArgumentParser: prog="TorBot", usage="Gather and analayze data from Tor sites." 
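+        # Illustrative invocation exercising the new flags (the .onion address
+        # is a placeholder; --deep-extract / --export-intel are defined below):
+        #   python main.py -u http://example.onion --depth 2 --deep-extract \
+        #       --export-intel intel.json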
    )
    parser.add_argument(
-        "-u", "--url", type=str, required=True, help="Specifiy a website link to crawl"
+        "-u", "--url", type=str, required=True, help="Specify a website link to crawl"
    )
    parser.add_argument(
-        "--depth", type=int, help="Specifiy max depth of crawler (default 1)", default=1
+        "--depth", type=int, help="Specify max depth of crawler (default 1)", default=1
    )
    parser.add_argument(
        "--host", type=str, help="IP address for SOCKS5 proxy", default="127.0.0.1"
    )
@@ -162,9 +219,15 @@ def set_arguments() -> argparse.ArgumentParser:
        help="Executes HTTP requests without using SOCKS5 proxy",
    )
    parser.add_argument(
-        "--html",
-        choices=["save", "display"],
-        help="Saves / Displays the html of the onion link",
+        "--deep-extract",
+        action="store_true",
+        help="Enable deep content extraction mode for OSINT intelligence gathering",
+    )
+    parser.add_argument(
+        "--export-intel",
+        type=str,
+        metavar="FILENAME",
+        help="Export extracted intelligence to JSON file (use with --deep-extract)",
    )

    return parser
diff --git a/requirements.txt b/requirements.txt
index c430132c..8d3ee09a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -256,19 +256,4 @@ urllib3==2.5.0 ; python_version >= "3.9" and python_full_version <= "3.11.4" \
 validators==0.20.0 ; python_version >= "3.9" and python_full_version <= "3.11.4" \
     --hash=sha256:24148ce4e64100a2d5e267233e23e7afeb55316b47d30faae7eb6e7292bc226a
 yattag==1.15.1 ; python_version >= "3.9" and python_full_version <= "3.11.4" \
-    --hash=sha256:960fa54be1229d96f43178133e0b195c003391fdc49ecdb6b69b7374db6be416
-
-numpy~=1.24.4
-beautifulsoup4~=4.11.1
-sklearn~=0.0
-scikit-learn~=1.3.0
-httpx[socks]~=0.25.0
-yattag~=1.15.1
-termcolor~=1.1.0
-python-dotenv~=0.20.0
-Unipath~=1.1
-validators~=0.20.0
-phonenumbers~=8.13.22
-tabulate~=0.9.0
-treelib~=1.7.0
-toml~=0.10.2
\ No newline at end of file
+    --hash=sha256:960fa54be1229d96f43178133e0b195c003391fdc49ecdb6b69b7374db6be416
\ No newline at end of file
diff --git a/src/torbot/modules/deep_extract/__init__.py b/src/torbot/modules/deep_extract/__init__.py
new file mode 100644
index 00000000..c25612de
--- /dev/null
+++ b/src/torbot/modules/deep_extract/__init__.py
@@ -0,0 +1,12 @@
+"""
+Deep Web Content Extraction Module
+
+This module provides comprehensive content extraction and intelligence gathering
+capabilities for dark web OSINT investigations.
+"""
+
+from .orchestrator import DeepExtractor
+from .base import BaseExtractor, ExtractionResult
+
+__all__ = ['DeepExtractor', 'BaseExtractor', 'ExtractionResult']
+
diff --git a/src/torbot/modules/deep_extract/base.py b/src/torbot/modules/deep_extract/base.py
new file mode 100644
index 00000000..24b3460f
--- /dev/null
+++ b/src/torbot/modules/deep_extract/base.py
@@ -0,0 +1,190 @@
+"""
+Base classes and utilities for deep content extraction
+"""
+
+import re
+from typing import List, Dict, Any, Optional
+from dataclasses import dataclass, field
+from abc import ABC, abstractmethod
+from datetime import datetime
+
+
+@dataclass
+class ExtractionResult:
+    """Container for extracted intelligence data"""
+
+    category: str  # Type of extraction (credentials, pii, crypto, etc.)
+    confidence: float  # Confidence score (0.0 to 1.0)
+    risk_level: str  # low, medium, high, critical
+    data: Dict[str, Any]  # The actual extracted data
+    context: Optional[str] = None  # Surrounding context
+    location: Optional[str] = None  # Location in page (URL, line number, etc.)
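+    # Illustrative example of a populated result (hypothetical values):
+    #   ExtractionResult(category='credentials', confidence=0.9, risk_level='critical',
+    #                    data={'type': 'email_password', 'email': 'user@example.com',
+    #                          'password': 'hunter2'}, location='http://example.onion')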
+    timestamp: datetime = field(default_factory=datetime.now)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for JSON serialization"""
+        return {
+            'category': self.category,
+            'confidence': self.confidence,
+            'risk_level': self.risk_level,
+            'data': self.data,
+            'context': self.context,
+            'location': self.location,
+            'timestamp': self.timestamp.isoformat()
+        }
+
+
+class BaseExtractor(ABC):
+    """Base class for all content extractors"""
+
+    def __init__(self):
+        self.results: List[ExtractionResult] = []
+
+    @abstractmethod
+    def extract(self, text: str, url: str = "") -> List[ExtractionResult]:
+        """
+        Extract intelligence from text content
+
+        Args:
+            text: The text content to analyze
+            url: The source URL (optional)
+
+        Returns:
+            List of ExtractionResult objects
+        """
+        pass
+
+    def get_context(self, text: str, match_start: int, match_end: int,
+                    context_chars: int = 100) -> str:
+        """
+        Extract surrounding context for a match
+
+        Args:
+            text: Full text content
+            match_start: Start position of match
+            match_end: End position of match
+            context_chars: Number of characters to include on each side
+
+        Returns:
+            Context string
+        """
+        start = max(0, match_start - context_chars)
+        end = min(len(text), match_end + context_chars)
+        context = text[start:end]
+
+        # Clean up context
+        context = context.replace('\n', ' ').replace('\r', ' ')
+        context = re.sub(r'\s+', ' ', context).strip()
+
+        return context
+
+    def calculate_risk_level(self, data_type: str, confidence: float) -> str:
+        """
+        Calculate risk level based on data type and confidence
+
+        Args:
+            data_type: Type of sensitive data found
+            confidence: Confidence score
+
+        Returns:
+            Risk level string
+        """
+        critical_types = ['password', 'ssn', 'credit_card', 'api_key', 'private_key']
+        high_types = ['email', 'phone', 'bitcoin', 'credential_dump']
+        medium_types = ['onion_link', 'ip_address', 'hash']
+
+        if data_type in critical_types and confidence > 0.7:
+            return 'critical'
+        elif data_type in high_types and confidence > 0.6:
+            return 'high'
+        elif data_type in medium_types and confidence > 0.5:
+            return 'medium'
+        else:
+            return 'low'
+
+
+class RegexPatterns:
+    """Common regex patterns for extraction"""
+
+    # Email patterns
+    EMAIL = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
+
+    # Cryptocurrency addresses
+    BITCOIN = r'\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b'
+    ETHEREUM = r'\b0x[a-fA-F0-9]{40}\b'
+    MONERO = r'\b4[0-9AB][1-9A-HJ-NP-Za-km-z]{93}\b'
+    LITECOIN = r'\b[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}\b'
+
+    # Onion links
+    ONION_V2 = r'\b[a-z2-7]{16}\.onion\b'
+    ONION_V3 = r'\b[a-z2-7]{56}\.onion\b'
+
+    # Network indicators
+    IPV4 = r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
+    IPV6 = r'\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b'
+    DOMAIN = r'\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}\b'
+
+    # PII
+    PHONE = r'\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b'
+    SSN = r'\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b'
+    CREDIT_CARD = r'\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|6(?:011|5[0-9]{2})[0-9]{12})\b'
+
+    # Credentials
+    USERNAME_PASSWORD = r'(?i)(?:username|user|login|email)[\s:=]+([^\s:]+)[\s\n\r]*(?:password|pass|pwd)[\s:=]+([^\s\n\r]+)'
+    API_KEY_AWS = r'\b(?:AKIA|ASIA)[0-9A-Z]{16}\b'
+    API_KEY_GENERIC = r'\b[a-zA-Z0-9_-]{32,}\b'
+    JWT_TOKEN = r'\beyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*\b'
+
+    # Hashes
+    MD5 = r'\b[a-fA-F0-9]{32}\b'
+
SHA1 = r'\b[a-fA-F0-9]{40}\b' + SHA256 = r'\b[a-fA-F0-9]{64}\b' + + # Communication + PGP_KEY = r'-----BEGIN PGP (?:PUBLIC|PRIVATE) KEY BLOCK-----' + PGP_FINGERPRINT = r'\b[0-9A-F]{40}\b' + JABBER = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\s|$)' + TELEGRAM = r'(?:@|t\.me/)[a-zA-Z0-9_]{5,32}' + WICKR = r'(?i)wickr(?:\s*:?\s*|me\s*:?\s*)([a-zA-Z0-9_-]{5,20})' + + # CVE + CVE = r'\bCVE-\d{4}-\d{4,7}\b' + + +class LuhnValidator: + """Luhn algorithm for credit card validation""" + + @staticmethod + def validate(number: str) -> bool: + """ + Validate credit card number using Luhn algorithm + + Args: + number: Credit card number string + + Returns: + True if valid, False otherwise + """ + try: + # Remove any spaces or dashes + number = number.replace(' ', '').replace('-', '') + + if not number.isdigit(): + return False + + # Luhn algorithm + total = 0 + reverse_digits = number[::-1] + + for i, digit in enumerate(reverse_digits): + n = int(digit) + if i % 2 == 1: + n *= 2 + if n > 9: + n -= 9 + total += n + + return total % 10 == 0 + except (ValueError, AttributeError): + return False + diff --git a/src/torbot/modules/deep_extract/breach_detector.py b/src/torbot/modules/deep_extract/breach_detector.py new file mode 100644 index 00000000..f1eb5f91 --- /dev/null +++ b/src/torbot/modules/deep_extract/breach_detector.py @@ -0,0 +1,299 @@ +""" +Data Breach Detection Extractor +""" + +import re +from typing import List, Dict +from .base import BaseExtractor, ExtractionResult + + +class BreachDetector(BaseExtractor): + """Detect and analyze data breach dumps and credential leaks""" + + def extract(self, text: str, url: str = "") -> List[ExtractionResult]: + """Detect data breaches and credential dumps""" + results = [] + + # Detect breach announcements + results.extend(self._detect_breach_announcements(text, url)) + + # Detect credential dumps + results.extend(self._detect_credential_dumps(text, url)) + + # Detect database leaks + results.extend(self._detect_database_leaks(text, url)) + + # Detect combo lists + results.extend(self._detect_combo_lists(text, url)) + + # Estimate breach size + results.extend(self._estimate_breach_size(text, url)) + + self.results.extend(results) + return results + + def _detect_breach_announcements(self, text: str, url: str) -> List[ExtractionResult]: + """Detect breach announcements and posts""" + results = [] + + breach_keywords = [ + r'(?i)\b(?:database|db|data)\s+(?:breach|leak|dump|hacked|compromised)\b', + r'(?i)\b(?:breach|leak)\s+(?:of|from)\s+([A-Z][a-zA-Z0-9\s]{2,30})\b', + r'(?i)\bhacked\s+database\b', + r'(?i)\bdata\s+dump\b', + r'(?i)\bstolen\s+(?:database|data|credentials)\b' + ] + + for pattern in breach_keywords: + for match in re.finditer(pattern, text): + context = self.get_context(text, match.start(), match.end(), 250) + + # Try to extract company/target name + target = match.group(1) if match.lastindex else None + + results.append(ExtractionResult( + category='data_breach', + confidence=0.85, + risk_level='critical', + data={ + 'type': 'breach_announcement', + 'target': target, + 'indicator': match.group(0) + }, + context=context, + location=url + )) + + # Specific breach format patterns + breach_format_patterns = [ + r'(?i)(?:database|dump):\s*([^\n]{10,100})', + r'(?i)source:\s*([^\n]{5,100})', + r'(?i)leaked\s+from:\s*([^\n]{5,100})' + ] + + for pattern in breach_format_patterns: + for match in re.finditer(pattern, text): + source_info = match.group(1).strip() + context = self.get_context(text, match.start(), match.end()) + + 
results.append(ExtractionResult( + category='data_breach', + confidence=0.8, + risk_level='high', + data={ + 'type': 'breach_source', + 'source': source_info + }, + context=context, + location=url + )) + + return results + + def _detect_credential_dumps(self, text: str, url: str) -> List[ExtractionResult]: + """Detect credential dump patterns""" + results = [] + + # Count email:password patterns + email_pass_pattern = r'(?m)^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}:[^\s:]+$' + matches = list(re.finditer(email_pass_pattern, text)) + + if len(matches) >= 5: # If we find multiple credential pairs + # Get first and last match for context + first_match = matches[0] + last_match = matches[-1] + + context = self.get_context( + text, + first_match.start(), + min(last_match.end(), first_match.start() + 500), + 100 + ) + + # Extract sample credentials (first 3) + samples = [matches[i].group(0) for i in range(min(3, len(matches)))] + + results.append(ExtractionResult( + category='data_breach', + confidence=0.95, + risk_level='critical', + data={ + 'type': 'credential_dump', + 'format': 'email:password', + 'estimated_count': len(matches), + 'samples': samples + }, + context=context, + location=url + )) + + # Count username:password patterns + user_pass_pattern = r'(?m)^[a-zA-Z0-9_-]{3,20}:[^\s:]+$' + matches = list(re.finditer(user_pass_pattern, text)) + + if len(matches) >= 10: # Higher threshold for username:password + first_match = matches[0] + context = self.get_context(text, first_match.start(), first_match.start() + 500) + + samples = [matches[i].group(0) for i in range(min(3, len(matches)))] + + results.append(ExtractionResult( + category='data_breach', + confidence=0.85, + risk_level='critical', + data={ + 'type': 'credential_dump', + 'format': 'username:password', + 'estimated_count': len(matches), + 'samples': samples + }, + context=context, + location=url + )) + + return results + + def _detect_database_leaks(self, text: str, url: str) -> List[ExtractionResult]: + """Detect database dump patterns""" + results = [] + + # SQL dump indicators + sql_patterns = [ + r'(?i)(?:INSERT INTO|CREATE TABLE|DROP TABLE)', + r'(?i)(?:mysql|postgresql|mongodb|mssql)\s+dump', + r'(?i)\.sql\s+(?:file|dump|backup)' + ] + + for pattern in sql_patterns: + matches = list(re.finditer(pattern, text)) + if matches: + first_match = matches[0] + context = self.get_context(text, first_match.start(), first_match.end(), 200) + + results.append(ExtractionResult( + category='data_breach', + confidence=0.9, + risk_level='high', + data={ + 'type': 'database_dump', + 'dump_type': 'SQL', + 'indicator_count': len(matches) + }, + context=context, + location=url + )) + break # Only report once per type + + # JSON database dumps + if text.count('"password"') >= 5 or text.count('"email"') >= 5: + # Look for JSON array of user objects + json_user_pattern = r'\{\s*"(?:email|username|user)"[^}]{10,200}"password"[^}]{5,100}\}' + matches = list(re.finditer(json_user_pattern, text, re.IGNORECASE)) + + if len(matches) >= 3: + first_match = matches[0] + context = self.get_context(text, first_match.start(), first_match.end(), 150) + + results.append(ExtractionResult( + category='data_breach', + confidence=0.85, + risk_level='high', + data={ + 'type': 'database_dump', + 'dump_type': 'JSON', + 'record_count': len(matches) + }, + context=context, + location=url + )) + + return results + + def _detect_combo_lists(self, text: str, url: str) -> List[ExtractionResult]: + """Detect combo lists (credential lists from multiple sources)""" + 
results = []
+
+        combo_keywords = [
+            r'(?i)\bcombo\s+list\b',
+            r'(?i)\bcombos\b.*\b(?:million|thousand|k)\b',
+            r'(?i)\bmixed\s+(?:credentials|combos)\b',
+            r'(?i)\b(?:private|fresh)\s+combos\b'
+        ]
+
+        for pattern in combo_keywords:
+            for match in re.finditer(pattern, text):
+                context = self.get_context(text, match.start(), match.end(), 250)
+
+                # Try to extract size
+                size_match = re.search(r'(\d+(?:\.\d+)?)\s*(million|mil|m|thousand|k)', context, re.IGNORECASE)
+                estimated_size = None
+                if size_match:
+                    num = float(size_match.group(1))
+                    unit = size_match.group(2).lower()
+                    if 'm' in unit:
+                        estimated_size = int(num * 1000000)
+                    elif 'k' in unit:
+                        estimated_size = int(num * 1000)
+
+                results.append(ExtractionResult(
+                    category='data_breach',
+                    confidence=0.9,
+                    risk_level='critical',
+                    data={
+                        'type': 'combo_list',
+                        'estimated_size': estimated_size
+                    },
+                    context=context,
+                    location=url
+                ))
+
+        return results
+
+    def _estimate_breach_size(self, text: str, url: str) -> List[ExtractionResult]:
+        """Estimate the size of data breaches mentioned"""
+        results = []
+
+        size_patterns = [
+            r'(?i)(\d+(?:\.\d+)?)\s*(million|billion|thousand|mil|m|k|gb|mb)\s+(?:records?|users?|accounts?|credentials?|passwords?|emails?)',
+            r'(?i)(?:contains|includes|total)[:\s]+(\d+(?:\.\d+)?)\s*(million|billion|thousand|mil|m|k)\s+(?:records?|entries?)',
+        ]
+
+        for pattern in size_patterns:
+            for match in re.finditer(pattern, text):
+                number = float(match.group(1))
+                unit = match.group(2).lower()
+                context = self.get_context(text, match.start(), match.end())
+
+                # Convert to actual count
+                multiplier = 1
+                if 'billion' in unit or 'b' == unit:
+                    multiplier = 1000000000
+                elif 'million' in unit or 'm' in unit:
+                    multiplier = 1000000
+                elif 'thousand' in unit or 'k' in unit:
+                    multiplier = 1000
+
+                estimated_count = int(number * multiplier)
+
+                # Determine risk level based on size
+                risk_level = 'medium'
+                if estimated_count >= 1000000:
+                    risk_level = 'critical'
+                elif estimated_count >= 100000:
+                    risk_level = 'high'
+
+                results.append(ExtractionResult(
+                    category='data_breach',
+                    confidence=0.85,
+                    risk_level=risk_level,
+                    data={
+                        'type': 'breach_size',
+                        'estimated_records': estimated_count,
+                        'original_value': f"{number} {unit}"
+                    },
+                    context=context,
+                    location=url
+                ))
+
+        return results
+
diff --git a/src/torbot/modules/deep_extract/communication_extractor.py b/src/torbot/modules/deep_extract/communication_extractor.py
new file mode 100644
index 00000000..ff2444eb
--- /dev/null
+++ b/src/torbot/modules/deep_extract/communication_extractor.py
@@ -0,0 +1,289 @@
+"""
+Communication Methods Extractor
+"""
+
+import re
+from typing import List
+from .base import BaseExtractor, ExtractionResult, RegexPatterns
+
+
+class CommunicationExtractor(BaseExtractor):
+    """Extract communication methods and contact information"""
+
+    def extract(self, text: str, url: str = "") -> List[ExtractionResult]:
+        """Extract various communication methods"""
+        results = []
+
+        # Extract PGP keys
+        results.extend(self._extract_pgp_keys(text, url))
+
+        # Extract messaging app IDs
+        results.extend(self._extract_messaging_ids(text, url))
+
+        # Extract email addresses (for communication)
+        results.extend(self._extract_contact_emails(text, url))
+
+        # Extract IRC channels
+        results.extend(self._extract_irc_channels(text, url))
+
+        self.results.extend(results)
+        return results
+
+    def _extract_pgp_keys(self, text: str, url: str) -> List[ExtractionResult]:
+        """Extract PGP public keys and fingerprints"""
+        results = []
+
+        # PGP key
blocks + pgp_block_pattern = r'-----BEGIN PGP (?:PUBLIC|PRIVATE) KEY BLOCK-----(.*?)-----END PGP (?:PUBLIC|PRIVATE) KEY BLOCK-----' + for match in re.finditer(pgp_block_pattern, text, re.DOTALL): + key_content = match.group(1).strip() + context = self.get_context(text, match.start(), match.end(), 100) + + key_type = 'PUBLIC' if 'PUBLIC KEY' in match.group(0) else 'PRIVATE' + + results.append(ExtractionResult( + category='communication', + confidence=0.95, + risk_level='high' if key_type == 'PRIVATE' else 'medium', + data={ + 'type': 'pgp_key_block', + 'key_type': key_type, + 'key_preview': key_content[:100] + '...' if len(key_content) > 100 else key_content + }, + context=context, + location=url + )) + + # PGP fingerprints + for match in re.finditer(RegexPatterns.PGP_FINGERPRINT, text): + fingerprint = match.group(0) + context = self.get_context(text, match.start(), match.end()) + + # Check if context suggests this is a PGP fingerprint + pgp_keywords = ['pgp', 'fingerprint', 'key', 'gpg', 'encryption'] + if any(keyword in context.lower() for keyword in pgp_keywords): + results.append(ExtractionResult( + category='communication', + confidence=0.85, + risk_level='medium', + data={ + 'type': 'pgp_fingerprint', + 'fingerprint': fingerprint + }, + context=context, + location=url + )) + + return results + + def _extract_messaging_ids(self, text: str, url: str) -> List[ExtractionResult]: + """Extract messaging app identifiers""" + results = [] + + # Telegram + for match in re.finditer(RegexPatterns.TELEGRAM, text): + telegram_id = match.group(0) + context = self.get_context(text, match.start(), match.end()) + + # Clean up the ID + clean_id = telegram_id.replace('@', '').replace('t.me/', '') + + results.append(ExtractionResult( + category='communication', + confidence=0.9, + risk_level='medium', + data={ + 'type': 'telegram', + 'username': clean_id, + 'full_handle': telegram_id + }, + context=context, + location=url + )) + + # Wickr + for match in re.finditer(RegexPatterns.WICKR, text): + wickr_id = match.group(1) if match.lastindex else match.group(0) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='communication', + confidence=0.85, + risk_level='medium', + data={ + 'type': 'wickr', + 'username': wickr_id + }, + context=context, + location=url + )) + + # Signal + signal_pattern = r'(?i)signal[:\s]+([+\d\s()-]{10,20})' + for match in re.finditer(signal_pattern, text): + signal_number = match.group(1).strip() + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='communication', + confidence=0.8, + risk_level='medium', + data={ + 'type': 'signal', + 'phone_number': signal_number + }, + context=context, + location=url + )) + + # Session + session_pattern = r'(?i)session\s+id[:\s]+([a-f0-9]{64,66})' + for match in re.finditer(session_pattern, text): + session_id = match.group(1) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='communication', + confidence=0.85, + risk_level='medium', + data={ + 'type': 'session', + 'session_id': session_id + }, + context=context, + location=url + )) + + # Jabber/XMPP + jabber_pattern = r'\b([a-zA-Z0-9._%+-]+@(?:[a-zA-Z0-9-]+\.)*xmpp\.[a-zA-Z]{2,}|[a-zA-Z0-9._%+-]+@jabber\.[a-zA-Z0-9.-]+)\b' + for match in re.finditer(jabber_pattern, text): + jabber_id = match.group(0) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + 
category='communication', + confidence=0.85, + risk_level='medium', + data={ + 'type': 'jabber', + 'jabber_id': jabber_id + }, + context=context, + location=url + )) + + # Discord + discord_pattern = r'(?i)discord[:\s]+([a-zA-Z0-9_]{2,32}#\d{4})' + for match in re.finditer(discord_pattern, text): + discord_id = match.group(1) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='communication', + confidence=0.9, + risk_level='low', + data={ + 'type': 'discord', + 'username': discord_id + }, + context=context, + location=url + )) + + # Matrix + matrix_pattern = r'@[a-zA-Z0-9._=-]+:[a-zA-Z0-9.-]+' + for match in re.finditer(matrix_pattern, text): + matrix_id = match.group(0) + context = self.get_context(text, match.start(), match.end()) + + # Check if context suggests Matrix + if 'matrix' in context.lower(): + results.append(ExtractionResult( + category='communication', + confidence=0.8, + risk_level='medium', + data={ + 'type': 'matrix', + 'user_id': matrix_id + }, + context=context, + location=url + )) + + return results + + def _extract_contact_emails(self, text: str, url: str) -> List[ExtractionResult]: + """Extract email addresses used for contact""" + results = [] + + contact_patterns = [ + r'(?i)contact[:\s]+(' + RegexPatterns.EMAIL + r')', + r'(?i)email[:\s]+(' + RegexPatterns.EMAIL + r')', + r'(?i)reach\s+(?:me|us)\s+at[:\s]+(' + RegexPatterns.EMAIL + r')' + ] + + for pattern in contact_patterns: + for match in re.finditer(pattern, text): + email = match.group(1) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='communication', + confidence=0.9, + risk_level='medium', + data={ + 'type': 'contact_email', + 'email': email, + 'purpose': 'contact' + }, + context=context, + location=url + )) + + return results + + def _extract_irc_channels(self, text: str, url: str) -> List[ExtractionResult]: + """Extract IRC channels and servers""" + results = [] + + # IRC channel format + irc_channel_pattern = r'#[a-zA-Z0-9_-]{2,50}' + for match in re.finditer(irc_channel_pattern, text): + channel = match.group(0) + context = self.get_context(text, match.start(), match.end()) + + # Check if context suggests IRC + irc_keywords = ['irc', 'channel', 'chat', 'server'] + if any(keyword in context.lower() for keyword in irc_keywords): + results.append(ExtractionResult( + category='communication', + confidence=0.75, + risk_level='low', + data={ + 'type': 'irc_channel', + 'channel': channel + }, + context=context, + location=url + )) + + # IRC server format + irc_server_pattern = r'(?i)irc[:\s]+([a-zA-Z0-9.-]+(?::\d+)?)' + for match in re.finditer(irc_server_pattern, text): + server = match.group(1) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='communication', + confidence=0.8, + risk_level='low', + data={ + 'type': 'irc_server', + 'server': server + }, + context=context, + location=url + )) + + return results + diff --git a/src/torbot/modules/deep_extract/credentials_extractor.py b/src/torbot/modules/deep_extract/credentials_extractor.py new file mode 100644 index 00000000..192bb654 --- /dev/null +++ b/src/torbot/modules/deep_extract/credentials_extractor.py @@ -0,0 +1,302 @@ +""" +Credential and Authentication Data Extractor +""" + +import re +from typing import List +from .base import BaseExtractor, ExtractionResult, RegexPatterns + + +class CredentialsExtractor(BaseExtractor): + """Extract credentials and authentication 
data from content""" + + def extract(self, text: str, url: str = "") -> List[ExtractionResult]: + """Extract various types of credentials""" + results = [] + + # Extract username:password pairs + results.extend(self._extract_username_password_pairs(text, url)) + + # Extract API keys + results.extend(self._extract_api_keys(text, url)) + + # Extract JWT tokens + results.extend(self._extract_jwt_tokens(text, url)) + + # Extract password hashes + results.extend(self._extract_password_hashes(text, url)) + + # Extract session tokens + results.extend(self._extract_session_tokens(text, url)) + + self.results.extend(results) + return results + + def _extract_username_password_pairs(self, text: str, url: str) -> List[ExtractionResult]: + """Extract username:password combinations""" + results = [] + + # Pattern 1: username:password format + pattern1 = r'(?m)^([a-zA-Z0-9._%+-]+):([^\s:]+)$' + for match in re.finditer(pattern1, text): + username, password = match.groups() + + # Skip if it looks like a URL or ratio + if '/' in username or '/' in password: + continue + + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='credentials', + confidence=0.85, + risk_level='critical', + data={ + 'type': 'username_password', + 'username': username, + 'password': password, + 'format': 'username:password' + }, + context=context, + location=url + )) + + # Pattern 2: email:password format + pattern2 = r'(' + RegexPatterns.EMAIL + r'):([^\s:]+)' + for match in re.finditer(pattern2, text): + email = match.group(1) + password = match.group(2) + + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='credentials', + confidence=0.9, + risk_level='critical', + data={ + 'type': 'email_password', + 'email': email, + 'password': password, + 'format': 'email:password' + }, + context=context, + location=url + )) + + # Pattern 3: Labeled credentials + pattern3 = r'(?i)(?:username|user|login|email)[\s:=]+([^\s:]+)[\s\n\r]{0,10}(?:password|pass|pwd)[\s:=]+([^\s\n\r]+)' + for match in re.finditer(pattern3, text): + username = match.group(1).strip() + password = match.group(2).strip() + + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='credentials', + confidence=0.8, + risk_level='critical', + data={ + 'type': 'labeled_credentials', + 'username': username, + 'password': password, + 'format': 'labeled' + }, + context=context, + location=url + )) + + return results + + def _extract_api_keys(self, text: str, url: str) -> List[ExtractionResult]: + """Extract API keys and tokens""" + results = [] + + # AWS Access Keys + for match in re.finditer(RegexPatterns.API_KEY_AWS, text): + key = match.group(0) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='credentials', + confidence=0.95, + risk_level='critical', + data={ + 'type': 'aws_access_key', + 'key': key, + 'provider': 'AWS' + }, + context=context, + location=url + )) + + # GitHub tokens + github_pattern = r'\bgh[pousr]_[A-Za-z0-9_]{36,}\b' + for match in re.finditer(github_pattern, text): + key = match.group(0) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='credentials', + confidence=0.95, + risk_level='critical', + data={ + 'type': 'github_token', + 'key': key, + 'provider': 'GitHub' + }, + context=context, + location=url + )) + + # Slack tokens + slack_pattern = 
r'\bxox[baprs]-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24,}\b' + for match in re.finditer(slack_pattern, text): + key = match.group(0) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='credentials', + confidence=0.95, + risk_level='critical', + data={ + 'type': 'slack_token', + 'key': key, + 'provider': 'Slack' + }, + context=context, + location=url + )) + + # Generic API keys (look for common keywords) + api_key_patterns = [ + r'(?i)api[_-]?key[\s:=]+([a-zA-Z0-9_-]{20,})', + r'(?i)apikey[\s:=]+([a-zA-Z0-9_-]{20,})', + r'(?i)access[_-]?token[\s:=]+([a-zA-Z0-9_-]{20,})', + ] + + for pattern in api_key_patterns: + for match in re.finditer(pattern, text): + key = match.group(1) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='credentials', + confidence=0.7, + risk_level='high', + data={ + 'type': 'generic_api_key', + 'key': key, + 'provider': 'Unknown' + }, + context=context, + location=url + )) + + return results + + def _extract_jwt_tokens(self, text: str, url: str) -> List[ExtractionResult]: + """Extract JWT tokens""" + results = [] + + for match in re.finditer(RegexPatterns.JWT_TOKEN, text): + token = match.group(0) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='credentials', + confidence=0.9, + risk_level='high', + data={ + 'type': 'jwt_token', + 'token': token[:50] + '...' if len(token) > 50 else token + }, + context=context, + location=url + )) + + return results + + def _extract_password_hashes(self, text: str, url: str) -> List[ExtractionResult]: + """Extract password hashes""" + results = [] + + # Look for hashes in common formats + hash_patterns = [ + (RegexPatterns.MD5, 'MD5', 0.6), + (RegexPatterns.SHA1, 'SHA1', 0.65), + (RegexPatterns.SHA256, 'SHA256', 0.7), + ] + + for pattern, hash_type, confidence in hash_patterns: + # Look for hashes with password-related context + for match in re.finditer(pattern, text): + hash_value = match.group(0) + context = self.get_context(text, match.start(), match.end(), 200) + + # Check if context suggests this is a password hash + password_keywords = ['password', 'passwd', 'pwd', 'hash', 'credential'] + if any(keyword in context.lower() for keyword in password_keywords): + results.append(ExtractionResult( + category='credentials', + confidence=confidence + 0.2, + risk_level='high', + data={ + 'type': 'password_hash', + 'hash_type': hash_type, + 'hash': hash_value + }, + context=context, + location=url + )) + + # Bcrypt hashes + bcrypt_pattern = r'\$2[ayb]\$[0-9]{2}\$[A-Za-z0-9./]{53}' + for match in re.finditer(bcrypt_pattern, text): + hash_value = match.group(0) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='credentials', + confidence=0.95, + risk_level='high', + data={ + 'type': 'password_hash', + 'hash_type': 'bcrypt', + 'hash': hash_value + }, + context=context, + location=url + )) + + return results + + def _extract_session_tokens(self, text: str, url: str) -> List[ExtractionResult]: + """Extract session identifiers and tokens""" + results = [] + + session_patterns = [ + r'(?i)session[_-]?id[\s:=]+([a-zA-Z0-9_-]{20,})', + r'(?i)phpsessid=([a-zA-Z0-9]{26,})', + r'(?i)jsessionid=([a-zA-Z0-9]{32,})', + r'(?i)asp\.net_sessionid=([a-zA-Z0-9]{24,})', + ] + + for pattern in session_patterns: + for match in re.finditer(pattern, text): + session_id = match.group(1) if match.lastindex else match.group(0) 
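+                # group(1) is the captured token when the pattern defines a
+                # capture group; match.group(0) is a whole-match fallback for
+                # patterns without one.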
+ context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='credentials', + confidence=0.75, + risk_level='medium', + data={ + 'type': 'session_token', + 'session_id': session_id + }, + context=context, + location=url + )) + + return results + diff --git a/src/torbot/modules/deep_extract/crypto_extractor.py b/src/torbot/modules/deep_extract/crypto_extractor.py new file mode 100644 index 00000000..1e6bbcd2 --- /dev/null +++ b/src/torbot/modules/deep_extract/crypto_extractor.py @@ -0,0 +1,228 @@ +""" +Cryptocurrency Address Extractor and Tracker +""" + +import re +from typing import List +from .base import BaseExtractor, ExtractionResult, RegexPatterns + + +class CryptoExtractor(BaseExtractor): + """Extract cryptocurrency addresses and related information""" + + def extract(self, text: str, url: str = "") -> List[ExtractionResult]: + """Extract various cryptocurrency addresses""" + results = [] + + # Extract Bitcoin addresses + results.extend(self._extract_bitcoin(text, url)) + + # Extract Ethereum addresses + results.extend(self._extract_ethereum(text, url)) + + # Extract Monero addresses + results.extend(self._extract_monero(text, url)) + + # Extract Litecoin addresses + results.extend(self._extract_litecoin(text, url)) + + # Extract other cryptocurrency mentions + results.extend(self._extract_crypto_keywords(text, url)) + + self.results.extend(results) + return results + + def _extract_bitcoin(self, text: str, url: str) -> List[ExtractionResult]: + """Extract Bitcoin addresses""" + results = [] + + for match in re.finditer(RegexPatterns.BITCOIN, text): + address = match.group(0) + context = self.get_context(text, match.start(), match.end()) + + # Validate Bitcoin address format more strictly + if self._validate_bitcoin_address(address): + results.append(ExtractionResult( + category='cryptocurrency', + confidence=0.9, + risk_level='high', + data={ + 'type': 'bitcoin', + 'address': address, + 'currency': 'BTC', + 'address_type': self._get_bitcoin_type(address) + }, + context=context, + location=url + )) + + return results + + def _validate_bitcoin_address(self, address: str) -> bool: + """Validate Bitcoin address format""" + # Basic validation - starts with 1, 3, or bc1 + if not (address.startswith('1') or address.startswith('3') or address.startswith('bc1')): + return False + + # Length check + if address.startswith('bc1'): # Bech32 + return 42 <= len(address) <= 62 + else: # Base58 + return 26 <= len(address) <= 35 + + return True + + def _get_bitcoin_type(self, address: str) -> str: + """Determine Bitcoin address type""" + if address.startswith('1'): + return 'P2PKH (Legacy)' + elif address.startswith('3'): + return 'P2SH (SegWit)' + elif address.startswith('bc1'): + return 'Bech32 (Native SegWit)' + return 'Unknown' + + def _extract_ethereum(self, text: str, url: str) -> List[ExtractionResult]: + """Extract Ethereum addresses""" + results = [] + + for match in re.finditer(RegexPatterns.ETHEREUM, text): + address = match.group(0) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='cryptocurrency', + confidence=0.9, + risk_level='high', + data={ + 'type': 'ethereum', + 'address': address, + 'currency': 'ETH', + 'checksum_validated': False # Could add EIP-55 validation + }, + context=context, + location=url + )) + + return results + + def _extract_monero(self, text: str, url: str) -> List[ExtractionResult]: + """Extract Monero addresses""" + results = [] + + for match in 
re.finditer(RegexPatterns.MONERO, text): + address = match.group(0) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='cryptocurrency', + confidence=0.85, + risk_level='high', + data={ + 'type': 'monero', + 'address': address, + 'currency': 'XMR', + 'privacy_coin': True + }, + context=context, + location=url + )) + + return results + + def _extract_litecoin(self, text: str, url: str) -> List[ExtractionResult]: + """Extract Litecoin addresses""" + results = [] + + for match in re.finditer(RegexPatterns.LITECOIN, text): + address = match.group(0) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='cryptocurrency', + confidence=0.8, + risk_level='high', + data={ + 'type': 'litecoin', + 'address': address, + 'currency': 'LTC' + }, + context=context, + location=url + )) + + return results + + def _extract_crypto_keywords(self, text: str, url: str) -> List[ExtractionResult]: + """Extract cryptocurrency-related keywords and contexts""" + results = [] + + # Payment request patterns + payment_patterns = [ + (r'(?i)send\s+(\d+(?:\.\d+)?)\s*(btc|bitcoin|eth|ethereum|xmr|monero)', 0.7), + (r'(?i)price[:\s]+(\d+(?:\.\d+)?)\s*(btc|bitcoin|eth|ethereum|xmr|monero)', 0.75), + (r'(?i)payment[:\s]+(\d+(?:\.\d+)?)\s*(btc|bitcoin|eth|ethereum|xmr|monero)', 0.8), + ] + + for pattern, confidence in payment_patterns: + for match in re.finditer(pattern, text): + amount = match.group(1) + currency = match.group(2) + context = self.get_context(text, match.start(), match.end(), 150) + + results.append(ExtractionResult( + category='cryptocurrency', + confidence=confidence, + risk_level='medium', + data={ + 'type': 'payment_request', + 'amount': amount, + 'currency': currency.upper(), + }, + context=context, + location=url + )) + + # Wallet mentions + wallet_pattern = r'(?i)wallet[:\s]+([a-zA-Z0-9]{20,})' + for match in re.finditer(wallet_pattern, text): + wallet_id = match.group(1) + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='cryptocurrency', + confidence=0.6, + risk_level='medium', + data={ + 'type': 'wallet_mention', + 'wallet_id': wallet_id + }, + context=context, + location=url + )) + + # Exchange mentions + exchanges = [ + 'binance', 'coinbase', 'kraken', 'bitstamp', 'bitfinex', + 'huobi', 'okex', 'kucoin', 'gemini', 'bittrex' + ] + + for exchange in exchanges: + pattern = r'\b' + exchange + r'\b' + for match in re.finditer(pattern, text, re.IGNORECASE): + context = self.get_context(text, match.start(), match.end()) + + results.append(ExtractionResult( + category='cryptocurrency', + confidence=0.5, + risk_level='low', + data={ + 'type': 'exchange_mention', + 'exchange': exchange.title() + }, + context=context, + location=url + )) + + return results + diff --git a/src/torbot/modules/deep_extract/hidden_services_extractor.py b/src/torbot/modules/deep_extract/hidden_services_extractor.py new file mode 100644 index 00000000..2e02533f --- /dev/null +++ b/src/torbot/modules/deep_extract/hidden_services_extractor.py @@ -0,0 +1,206 @@ +""" +Hidden Services Intelligence Extractor +""" + +import re +from typing import List, Dict +from .base import BaseExtractor, ExtractionResult, RegexPatterns + + +class HiddenServicesExtractor(BaseExtractor): + """Extract and classify hidden service (.onion) links""" + + # Service type keywords for classification + SERVICE_KEYWORDS = { + 'marketplace': ['market', 'shop', 'store', 'buy', 'sell', 
'vendor', 'product', 'cart', 'price'],
+        'forum': ['forum', 'board', 'discussion', 'thread', 'post', 'reply', 'topic', 'community'],
+        'hosting': ['host', 'hosting', 'server', 'vps', 'dedicated', 'upload', 'file'],
+        'email': ['mail', 'email', 'inbox', 'message', 'webmail'],
+        'wiki': ['wiki', 'encyclopedia', 'article', 'knowledge'],
+        'blog': ['blog', 'news', 'article', 'post'],
+        'paste': ['paste', 'pastebin', 'snippet'],
+        'search': ['search', 'index', 'directory', 'engine'],
+        'chat': ['chat', 'irc', 'messenger', 'talk'],
+        'financial': ['bank', 'bitcoin', 'crypto', 'wallet', 'exchange', 'atm'],
+        'social': ['social', 'network', 'profile', 'friend'],
+        'darknet': ['darknet', 'deep web', 'anonymous', 'privacy'],
+    }
+
+    def extract(self, text: str, url: str = "") -> List[ExtractionResult]:
+        """Extract hidden service links and information"""
+        results = []
+
+        # Extract v2 onion links
+        results.extend(self._extract_onion_links(text, url, RegexPatterns.ONION_V2, 'v2'))
+
+        # Extract v3 onion links
+        results.extend(self._extract_onion_links(text, url, RegexPatterns.ONION_V3, 'v3'))
+
+        # Extract service descriptions and metadata
+        results.extend(self._extract_service_metadata(text, url))
+
+        self.results.extend(results)
+        return results
+
+    def _extract_onion_links(self, text: str, url: str, pattern: str, version: str) -> List[ExtractionResult]:
+        """Extract onion links with classification"""
+        results = []
+
+        for match in re.finditer(pattern, text):
+            onion_address = match.group(0)
+            context = self.get_context(text, match.start(), match.end(), 200)
+
+            # Classify service type based on context
+            service_type, confidence_adjustment = self._classify_service(context)
+
+            # Calculate trust score (basic heuristic)
+            trust_score = self._calculate_trust_score(context, onion_address)
+
+            base_confidence = 0.95 if version == 'v3' else 0.9
+
+            results.append(ExtractionResult(
+                category='hidden_services',
+                confidence=min(1.0, base_confidence + confidence_adjustment),
+                risk_level=self._determine_risk_level(service_type),
+                data={
+                    'type': 'onion_link',
+                    'address': onion_address,
+                    'full_url': f'http://{onion_address}',
+                    'version': version,
+                    'service_type': service_type,
+                    'trust_score': trust_score
+                },
+                context=context,
+                location=url
+            ))
+
+        return results
+
+    def _classify_service(self, context: str) -> tuple:
+        """Classify hidden service based on context"""
+        context_lower = context.lower()
+
+        # Count keyword matches for each category
+        category_scores = {}
+        for category, keywords in self.SERVICE_KEYWORDS.items():
+            score = sum(1 for keyword in keywords if keyword in context_lower)
+            if score > 0:
+                category_scores[category] = score
+
+        if not category_scores:
+            return 'unknown', 0.0
+
+        # Get category with highest score
+        best_category = max(category_scores, key=category_scores.get)
+        max_score = category_scores[best_category]
+
+        # Confidence adjustment based on match strength
+        confidence_adjustment = min(0.1 * max_score, 0.3)
+
+        return best_category, confidence_adjustment
+
+    def _calculate_trust_score(self, context: str, address: str) -> float:
+        """Calculate basic trust score for hidden service"""
+        score = 0.5  # Base score
+
+        context_lower = context.lower()
+
+        # Positive indicators
+        positive_keywords = ['verified', 'trusted', 'official', 'secure', 'reputation', 'reviews']
+        score += 0.05 * sum(1 for keyword in positive_keywords if keyword in context_lower)
+
+        # Negative indicators
+        negative_keywords = ['scam', 'fake', 'phishing', 'warning', 'unsafe', 'malware', 'virus']
+        score -=
0.1 * sum(1 for keyword in negative_keywords if keyword in context_lower) + + # Clamp score between 0 and 1 + return max(0.0, min(1.0, score)) + + def _determine_risk_level(self, service_type: str) -> str: + """Determine risk level based on service type""" + high_risk_types = ['marketplace', 'financial', 'darknet'] + medium_risk_types = ['forum', 'chat', 'paste', 'hosting'] + + if service_type in high_risk_types: + return 'high' + elif service_type in medium_risk_types: + return 'medium' + else: + return 'low' + + def _extract_service_metadata(self, text: str, url: str) -> List[ExtractionResult]: + """Extract hidden service metadata and descriptions""" + results = [] + + # Extract service titles + title_patterns = [ + r'