88 changes: 76 additions & 12 deletions main.py
@@ -12,6 +12,7 @@
from torbot.modules.updater import check_version
from torbot.modules.info import execute_all, fetch_html
from torbot.modules.linktree import LinkTree
from torbot.modules.deep_extract import DeepExtractor


def print_tor_ip_address(client: httpx.Client) -> None:
@@ -52,6 +53,63 @@ def print_header(version: str) -> None:
print(title)


def handle_deep_extraction(tree: LinkTree, client: httpx.Client, export_path: str = None) -> None:
"""
Handle deep content extraction from crawled pages.

Args:
tree: LinkTree object with crawled URLs
client: HTTP client for making requests
export_path: Optional path to export intelligence data
"""
logging.info("Starting deep content extraction...")
deep_extractor = DeepExtractor()

# Extract content from each page in the tree
pages_analyzed = 0
for node_url in tree.nodes:
try:
logging.debug(f"Extracting from: {node_url}")
response = client.get(node_url)
if response.status_code == 200:
deep_extractor.extract_all(response.text, node_url)
pages_analyzed += 1
except Exception as e:
logging.warning(f"Could not extract from {node_url}: {str(e)}")

logging.info(f"Deep extraction complete. Analyzed {pages_analyzed} pages.")

# Print summary
deep_extractor.print_summary()

# Export to JSON if requested
if export_path:
logging.info(f"Exporting intelligence to {export_path}...")
deep_extractor.export_to_json(export_path)

# Also create a text report
base_path = export_path.rsplit('.', 1)[0] if '.' in export_path else export_path
text_report_path = f"{base_path}_report.txt"
deep_extractor.export_to_text(text_report_path)
logging.info(f"Text report saved to {text_report_path}")


def handle_visualization(tree: LinkTree, visualize_mode: str = None) -> None:
"""
Handle visualization of crawled data.

Args:
tree: LinkTree object with crawled data
visualize_mode: Visualization mode (table, tree, json)
"""
if visualize_mode == "table" or not visualize_mode:
tree.showTable()
elif visualize_mode == "tree":
print(tree)
elif visualize_mode == "json":
tree.showJSON()


def run(arg_parser: argparse.ArgumentParser, version: str) -> None:
args = arg_parser.parse_args()

@@ -66,7 +124,7 @@ def run(arg_parser: argparse.ArgumentParser, version: str) -> None:
arg_parser.print_help()
sys.exit()

# Print verison then exit
# Print version then exit
if args.version:
print(f"TorBot Version: {version}")
sys.exit()
@@ -93,6 +151,10 @@ def run(arg_parser: argparse.ArgumentParser, version: str) -> None:
tree = LinkTree(url=args.url, depth=args.depth, client=client)
tree.load()

# Deep extraction if requested
if args.deep_extract:
handle_deep_extraction(tree, client, args.export_intel)

# save data if desired
if args.save == "tree":
tree.save()
@@ -105,12 +167,7 @@ def run(arg_parser: argparse.ArgumentParser, version: str) -> None:
fetch_html(client, args.url, tree, save_html=True)

# always print something, table is the default
if args.visualize == "table" or not args.visualize:
tree.showTable()
elif args.visualize == "tree":
print(tree)
elif args.visualize == "json":
tree.showJSON()
handle_visualization(tree, args.visualize)

print("\n\n")

@@ -123,10 +180,10 @@ def set_arguments() -> argparse.ArgumentParser:
prog="TorBot", usage="Gather and analayze data from Tor sites."
)
parser.add_argument(
"-u", "--url", type=str, required=True, help="Specifiy a website link to crawl"
"-u", "--url", type=str, required=True, help="Specify a website link to crawl"
)
parser.add_argument(
"--depth", type=int, help="Specifiy max depth of crawler (default 1)", default=1
"--depth", type=int, help="Specify max depth of crawler (default 1)", default=1
)
parser.add_argument(
"--host", type=str, help="IP address for SOCKS5 proxy", default="127.0.0.1"
@@ -162,9 +219,16 @@ def set_arguments() -> argparse.ArgumentParser:
help="Executes HTTP requests without using SOCKS5 proxy",
)
parser.add_argument(
"--html",
choices=["save", "display"],
help="Saves / Displays the html of the onion link",

"--deep-extract",
action="store_true",
help="Enable deep content extraction mode for OSINT intelligence gathering",
)
parser.add_argument(
"--export-intel",
type=str,
metavar="FILENAME",
help="Export extracted intelligence to JSON file (use with --deep-extract)",
)

return parser
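
# A quick sanity check of the new flags (illustrative sketch, not part of the PR;
# the URL and filename are placeholders):
#
#     parser = set_arguments()
#     args = parser.parse_args(
#         ["-u", "http://example.onion", "--deep-extract", "--export-intel", "intel.json"]
#     )
#     assert args.deep_extract is True and args.export_intel == "intel.json"
#
# run() then forwards args.export_intel to handle_deep_extraction(), which writes
# intel.json plus a companion intel_report.txt text report.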
17 changes: 1 addition & 16 deletions requirements.txt
@@ -256,19 +256,4 @@ urllib3==2.5.0 ; python_version >= "3.9" and python_full_version <= "3.11.4" \
validators==0.20.0 ; python_version >= "3.9" and python_full_version <= "3.11.4" \
--hash=sha256:24148ce4e64100a2d5e267233e23e7afeb55316b47d30faae7eb6e7292bc226a
yattag==1.15.1 ; python_version >= "3.9" and python_full_version <= "3.11.4" \
--hash=sha256:960fa54be1229d96f43178133e0b195c003391fdc49ecdb6b69b7374db6be416

numpy~=1.24.4
beautifulsoup4~=4.11.1
sklearn~=0.0
scikit-learn~=1.3.0
httpx[socks]~=0.25.0
yattag~=1.15.1
termcolor~=1.1.0
python-dotenv~=0.20.0
Unipath~=1.1
validators~=0.20.0
phonenumbers~=8.13.22
tabulate~=0.9.0
treelib~=1.7.0
toml~=0.10.2
--hash=sha256:960fa54be1229d96f43178133e0b195c003391fdc49ecdb6b69b7374db6be416
12 changes: 12 additions & 0 deletions src/torbot/modules/deep_extract/__init__.py
@@ -0,0 +1,12 @@
"""
Deep Web Content Extraction Module

This module provides comprehensive content extraction and intelligence gathering
capabilities for dark web OSINT investigations.
"""

from .orchestrator import DeepExtractor
from .base import BaseExtractor, ExtractionResult

__all__ = ['DeepExtractor', 'BaseExtractor', 'ExtractionResult']

190 changes: 190 additions & 0 deletions src/torbot/modules/deep_extract/base.py
@@ -0,0 +1,190 @@
"""
Base classes and utilities for deep content extraction
"""

import re
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, field
from abc import ABC, abstractmethod
from datetime import datetime


@dataclass
class ExtractionResult:
"""Container for extracted intelligence data"""

category: str # Type of extraction (credentials, pii, crypto, etc.)
confidence: float # Confidence score (0.0 to 1.0)
risk_level: str # low, medium, high, critical
data: Dict[str, Any] # The actual extracted data
context: Optional[str] = None # Surrounding context
location: Optional[str] = None # Location in page (URL, line number, etc.)
timestamp: datetime = field(default_factory=datetime.now)

def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for JSON serialization"""
return {
'category': self.category,
'confidence': self.confidence,
'risk_level': self.risk_level,
'data': self.data,
'context': self.context,
'location': self.location,
'timestamp': self.timestamp.isoformat()
}
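
# A minimal usage sketch (values are illustrative): an extractor wraps each finding in
# an ExtractionResult, and to_dict() makes it JSON-serializable.
#
#     import json
#     result = ExtractionResult(
#         category="crypto",
#         confidence=0.9,
#         risk_level="high",
#         data={"type": "bitcoin", "address": "1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa"},
#         context="...send payment to 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa within 24h...",
#         location="http://example.onion/market",
#     )
#     print(json.dumps(result.to_dict(), indent=2))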


class BaseExtractor(ABC):
"""Base class for all content extractors"""

def __init__(self):
self.results: List[ExtractionResult] = []

@abstractmethod
def extract(self, text: str, url: str = "") -> List[ExtractionResult]:
"""
Extract intelligence from text content

Args:
text: The text content to analyze
url: The source URL (optional)

Returns:
List of ExtractionResult objects
"""
pass

def get_context(self, text: str, match_start: int, match_end: int,
context_chars: int = 100) -> str:
"""
Extract surrounding context for a match

Args:
text: Full text content
match_start: Start position of match
match_end: End position of match
context_chars: Number of characters to include on each side

Returns:
Context string
"""
start = max(0, match_start - context_chars)
end = min(len(text), match_end + context_chars)
context = text[start:end]

# Clean up context
context = context.replace('\n', ' ').replace('\r', ' ')
context = re.sub(r'\s+', ' ', context).strip()

return context
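
# Illustrative pairing with a regex hit (assumes a concrete extractor subclass; see the
# sketch after RegexPatterns below): keep roughly 100 characters of surrounding text
# per finding.
#
#     m = re.search(RegexPatterns.EMAIL, text)
#     if m:
#         snippet = self.get_context(text, m.start(), m.end())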

def calculate_risk_level(self, data_type: str, confidence: float) -> str:
"""
Calculate risk level based on data type and confidence

Args:
data_type: Type of sensitive data found
confidence: Confidence score

Returns:
Risk level string
"""
critical_types = ['password', 'ssn', 'credit_card', 'api_key', 'private_key']
high_types = ['email', 'phone', 'bitcoin', 'credential_dump']
medium_types = ['onion_link', 'ip_address', 'hash']

if data_type in critical_types and confidence > 0.7:
return 'critical'
elif data_type in high_types and confidence > 0.6:
return 'high'
elif data_type in medium_types and confidence > 0.5:
return 'medium'
else:
return 'low'
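
# Illustrative outcomes of the tiering above:
#     calculate_risk_level('password', 0.9)   -> 'critical'
#     calculate_risk_level('email', 0.65)     -> 'high'
#     calculate_risk_level('hash', 0.55)      -> 'medium'
#     calculate_risk_level('ip_address', 0.4) -> 'low'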


class RegexPatterns:
"""Common regex patterns for extraction"""

# Email patterns
EMAIL = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

# Cryptocurrency addresses
BITCOIN = r'\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b'
ETHEREUM = r'\b0x[a-fA-F0-9]{40}\b'
MONERO = r'\b4[0-9AB][1-9A-HJ-NP-Za-km-z]{93}\b'
LITECOIN = r'\b[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}\b'

# Onion links
ONION_V2 = r'\b[a-z2-7]{16}\.onion\b'
ONION_V3 = r'\b[a-z2-7]{56}\.onion\b'

# Network indicators
IPV4 = r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
IPV6 = r'\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b'
DOMAIN = r'\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}\b'

# PII
PHONE = r'\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b'
SSN = r'\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b'
CREDIT_CARD = r'\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|6(?:011|5[0-9]{2})[0-9]{12})\b'

# Credentials
USERNAME_PASSWORD = r'(?i)(?:username|user|login|email)[\s:=]+([^\s:]+)[\s\n\r]*(?:password|pass|pwd)[\s:=]+([^\s\n\r]+)'
API_KEY_AWS = r'\b(?:AKIA|ASIA)[0-9A-Z]{16}\b'
API_KEY_GENERIC = r'\b[a-zA-Z0-9_-]{32,}\b'
JWT_TOKEN = r'\beyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*\b'

# Hashes
MD5 = r'\b[a-fA-F0-9]{32}\b'
SHA1 = r'\b[a-fA-F0-9]{40}\b'
SHA256 = r'\b[a-fA-F0-9]{64}\b'

# Communication
PGP_KEY = r'-----BEGIN PGP (?:PUBLIC|PRIVATE) KEY BLOCK-----'
PGP_FINGERPRINT = r'\b[0-9A-F]{40}\b'
JABBER = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\s|$)'
TELEGRAM = r'(?:@|t\.me/)[a-zA-Z0-9_]{5,32}'
WICKR = r'(?i)wickr(?:\s*:?\s*|me\s*:?\s*)([a-zA-Z0-9_-]{5,20})'

# CVE
CVE = r'\bCVE-\d{4}-\d{4,7}\b'
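
# A minimal concrete extractor sketch (illustrative only; the PR's real extractors live in
# other modules of this package). It shows how BaseExtractor, RegexPatterns, get_context()
# and calculate_risk_level() are meant to compose:
#
#     class EmailExtractor(BaseExtractor):
#         """Finds email addresses and wraps them in ExtractionResult objects."""
#
#         def extract(self, text: str, url: str = "") -> List[ExtractionResult]:
#             results = []
#             for m in re.finditer(RegexPatterns.EMAIL, text):
#                 confidence = 0.8  # bare regex hit, no further validation here
#                 results.append(ExtractionResult(
#                     category="pii",
#                     confidence=confidence,
#                     risk_level=self.calculate_risk_level("email", confidence),
#                     data={"type": "email", "value": m.group(0)},
#                     context=self.get_context(text, m.start(), m.end()),
#                     location=url,
#                 ))
#             self.results.extend(results)
#             return results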


class LuhnValidator:
"""Luhn algorithm for credit card validation"""

@staticmethod
def validate(number: str) -> bool:
"""
Validate credit card number using Luhn algorithm

Args:
number: Credit card number string

Returns:
True if valid, False otherwise
"""
try:
# Remove any spaces or dashes
number = number.replace(' ', '').replace('-', '')

if not number.isdigit():
return False

# Luhn algorithm
total = 0
reverse_digits = number[::-1]

for i, digit in enumerate(reverse_digits):
n = int(digit)
if i % 2 == 1:
n *= 2
if n > 9:
n -= 9
total += n

return total % 10 == 0
except (ValueError, AttributeError):
return False
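
# Quick check of the algorithm above (standard test numbers, not real cards):
#     LuhnValidator.validate("4111 1111 1111 1111")  -> True
#     LuhnValidator.validate("4111111111111112")     -> False
# An extractor could pair CREDIT_CARD regex hits with this check to raise confidence
# before labelling a match as a card number.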
