88 changes: 76 additions & 12 deletions main.py
@@ -12,6 +12,7 @@
from torbot.modules.updater import check_version
from torbot.modules.info import execute_all, fetch_html
from torbot.modules.linktree import LinkTree
from torbot.modules.deep_extract import DeepExtractor


def print_tor_ip_address(client: httpx.Client) -> None:
@@ -52,6 +53,63 @@ def print_header(version: str) -> None:
print(title)


def handle_deep_extraction(tree: LinkTree, client: httpx.Client, export_path: str = None) -> None:
"""
Handle deep content extraction from crawled pages.

Args:
tree: LinkTree object with crawled URLs
client: HTTP client for making requests
export_path: Optional path to export intelligence data
"""
logging.info("Starting deep content extraction...")
deep_extractor = DeepExtractor()

# Extract content from each page in the tree
pages_analyzed = 0
for node_url in tree.nodes:
try:
logging.debug(f"Extracting from: {node_url}")
response = client.get(node_url)
if response.status_code == 200:
deep_extractor.extract_all(response.text, node_url)
pages_analyzed += 1
except Exception as e:
logging.warning(f"Could not extract from {node_url}: {str(e)}")

logging.info(f"Deep extraction complete. Analyzed {pages_analyzed} pages.")

# Print summary
deep_extractor.print_summary()

# Export to JSON if requested
if export_path:
logging.info(f"Exporting intelligence to {export_path}...")
deep_extractor.export_to_json(export_path)

# Also create a text report
base_path = export_path.rsplit('.', 1)[0] if '.' in export_path else export_path
text_report_path = f"{base_path}_report.txt"
deep_extractor.export_to_text(text_report_path)
logging.info(f"Text report saved to {text_report_path}")


def handle_visualization(tree: LinkTree, visualize_mode: str = None) -> None:
"""
Handle visualization of crawled data.

Args:
tree: LinkTree object with crawled data
visualize_mode: Visualization mode (table, tree, json)
"""
if visualize_mode == "table" or not visualize_mode:
tree.showTable()
elif visualize_mode == "tree":
print(tree)
elif visualize_mode == "json":
tree.showJSON()


def run(arg_parser: argparse.ArgumentParser, version: str) -> None:
args = arg_parser.parse_args()

@@ -66,7 +124,7 @@ def run(arg_parser: argparse.ArgumentParser, version: str) -> None:
arg_parser.print_help()
sys.exit()

# Print verison then exit
# Print version then exit
if args.version:
print(f"TorBot Version: {version}")
sys.exit()
@@ -93,6 +151,10 @@ def run(arg_parser: argparse.ArgumentParser, version: str) -> None:
tree = LinkTree(url=args.url, depth=args.depth, client=client)
tree.load()

# Deep extraction if requested
if args.deep_extract:
handle_deep_extraction(tree, client, args.export_intel)

# save data if desired
if args.save == "tree":
tree.save()
@@ -105,12 +167,7 @@ def run(arg_parser: argparse.ArgumentParser, version: str) -> None:
fetch_html(client, args.url, tree, save_html=True)

# always print something, table is the default
if args.visualize == "table" or not args.visualize:
tree.showTable()
elif args.visualize == "tree":
print(tree)
elif args.visualize == "json":
tree.showJSON()
handle_visualization(tree, args.visualize)

print("\n\n")

@@ -123,10 +180,10 @@ def set_arguments() -> argparse.ArgumentParser:
prog="TorBot", usage="Gather and analayze data from Tor sites."
)
parser.add_argument(
"-u", "--url", type=str, required=True, help="Specifiy a website link to crawl"
"-u", "--url", type=str, required=True, help="Specify a website link to crawl"
)
parser.add_argument(
"--depth", type=int, help="Specifiy max depth of crawler (default 1)", default=1
"--depth", type=int, help="Specify max depth of crawler (default 1)", default=1
)
parser.add_argument(
"--host", type=str, help="IP address for SOCKS5 proxy", default="127.0.0.1"
@@ -162,9 +219,16 @@ def set_arguments() -> argparse.ArgumentParser:
help="Executes HTTP requests without using SOCKS5 proxy",
)
parser.add_argument(
"--html",
choices=["save", "display"],
help="Saves / Displays the html of the onion link",

"--deep-extract",
action="store_true",
help="Enable deep content extraction mode for OSINT intelligence gathering",
)
parser.add_argument(
"--export-intel",
type=str,
metavar="FILENAME",
help="Export extracted intelligence to JSON file (use with --deep-extract)",
)

return parser
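
# A quick sanity check of the new flags (illustrative sketch, not part of the PR;
# the URL and filename are placeholders):
#
#     parser = set_arguments()
#     args = parser.parse_args(
#         ["-u", "http://example.onion", "--deep-extract", "--export-intel", "intel.json"]
#     )
#     assert args.deep_extract is True and args.export_intel == "intel.json"
#
# run() then forwards args.export_intel to handle_deep_extraction(), which writes
# intel.json plus a companion intel_report.txt text report.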
17 changes: 1 addition & 16 deletions requirements.txt
@@ -256,19 +256,4 @@ urllib3==2.5.0 ; python_version >= "3.9" and python_full_version <= "3.11.4" \
validators==0.20.0 ; python_version >= "3.9" and python_full_version <= "3.11.4" \
--hash=sha256:24148ce4e64100a2d5e267233e23e7afeb55316b47d30faae7eb6e7292bc226a
yattag==1.15.1 ; python_version >= "3.9" and python_full_version <= "3.11.4" \
--hash=sha256:960fa54be1229d96f43178133e0b195c003391fdc49ecdb6b69b7374db6be416

numpy~=1.24.4
beautifulsoup4~=4.11.1
sklearn~=0.0
scikit-learn~=1.3.0
httpx[socks]~=0.25.0
yattag~=1.15.1
termcolor~=1.1.0
python-dotenv~=0.20.0
Unipath~=1.1
validators~=0.20.0
phonenumbers~=8.13.22
tabulate~=0.9.0
treelib~=1.7.0
toml~=0.10.2
--hash=sha256:960fa54be1229d96f43178133e0b195c003391fdc49ecdb6b69b7374db6be416
12 changes: 12 additions & 0 deletions src/torbot/modules/deep_extract/__init__.py
@@ -0,0 +1,12 @@
"""
Deep Web Content Extraction Module

This module provides comprehensive content extraction and intelligence gathering
capabilities for dark web OSINT investigations.
"""

from .orchestrator import DeepExtractor
from .base import BaseExtractor, ExtractionResult

__all__ = ['DeepExtractor', 'BaseExtractor', 'ExtractionResult']

190 changes: 190 additions & 0 deletions src/torbot/modules/deep_extract/base.py
@@ -0,0 +1,190 @@
"""
Base classes and utilities for deep content extraction
"""

import re
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, field
from abc import ABC, abstractmethod
from datetime import datetime


@dataclass
class ExtractionResult:
"""Container for extracted intelligence data"""

category: str # Type of extraction (credentials, pii, crypto, etc.)
confidence: float # Confidence score (0.0 to 1.0)
risk_level: str # low, medium, high, critical
data: Dict[str, Any] # The actual extracted data
context: Optional[str] = None # Surrounding context
location: Optional[str] = None # Location in page (URL, line number, etc.)
timestamp: datetime = field(default_factory=datetime.now)

def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for JSON serialization"""
return {
'category': self.category,
'confidence': self.confidence,
'risk_level': self.risk_level,
'data': self.data,
'context': self.context,
'location': self.location,
'timestamp': self.timestamp.isoformat()
}
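
# A minimal usage sketch (values are illustrative): an extractor wraps each finding in
# an ExtractionResult, and to_dict() makes it JSON-serializable.
#
#     import json
#     result = ExtractionResult(
#         category="crypto",
#         confidence=0.9,
#         risk_level="high",
#         data={"type": "bitcoin", "address": "1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa"},
#         context="...send payment to 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa within 24h...",
#         location="http://example.onion/market",
#     )
#     print(json.dumps(result.to_dict(), indent=2))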


class BaseExtractor(ABC):
"""Base class for all content extractors"""

def __init__(self):
self.results: List[ExtractionResult] = []

@abstractmethod
def extract(self, text: str, url: str = "") -> List[ExtractionResult]:
"""
Extract intelligence from text content

Args:
text: The text content to analyze
url: The source URL (optional)

Returns:
List of ExtractionResult objects
"""
pass

def get_context(self, text: str, match_start: int, match_end: int,
context_chars: int = 100) -> str:
"""
Extract surrounding context for a match

Args:
text: Full text content
match_start: Start position of match
match_end: End position of match
context_chars: Number of characters to include on each side

Returns:
Context string
"""
start = max(0, match_start - context_chars)
end = min(len(text), match_end + context_chars)
context = text[start:end]

# Clean up context
context = context.replace('\n', ' ').replace('\r', ' ')
context = re.sub(r'\s+', ' ', context).strip()

return context
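
# Illustrative pairing with a regex hit (assumes a concrete extractor subclass; see the
# sketch after RegexPatterns below): keep roughly 100 characters of surrounding text
# per finding.
#
#     m = re.search(RegexPatterns.EMAIL, text)
#     if m:
#         snippet = self.get_context(text, m.start(), m.end())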

def calculate_risk_level(self, data_type: str, confidence: float) -> str:
"""
Calculate risk level based on data type and confidence

Args:
data_type: Type of sensitive data found
confidence: Confidence score

Returns:
Risk level string
"""
critical_types = ['password', 'ssn', 'credit_card', 'api_key', 'private_key']
high_types = ['email', 'phone', 'bitcoin', 'credential_dump']
medium_types = ['onion_link', 'ip_address', 'hash']

if data_type in critical_types and confidence > 0.7:
return 'critical'
elif data_type in high_types and confidence > 0.6:
return 'high'
elif data_type in medium_types and confidence > 0.5:
return 'medium'
else:
return 'low'
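
# Illustrative outcomes of the tiering above:
#     calculate_risk_level('password', 0.9)   -> 'critical'
#     calculate_risk_level('email', 0.65)     -> 'high'
#     calculate_risk_level('hash', 0.55)      -> 'medium'
#     calculate_risk_level('ip_address', 0.4) -> 'low'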


class RegexPatterns:
"""Common regex patterns for extraction"""

# Email patterns
EMAIL = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

# Cryptocurrency addresses
BITCOIN = r'\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b'
ETHEREUM = r'\b0x[a-fA-F0-9]{40}\b'
MONERO = r'\b4[0-9AB][1-9A-HJ-NP-Za-km-z]{93}\b'
LITECOIN = r'\b[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}\b'

# Onion links
ONION_V2 = r'\b[a-z2-7]{16}\.onion\b'
ONION_V3 = r'\b[a-z2-7]{56}\.onion\b'

# Network indicators
IPV4 = r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
IPV6 = r'\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b'
DOMAIN = r'\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}\b'

# PII
PHONE = r'\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b'
SSN = r'\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b'
CREDIT_CARD = r'\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|6(?:011|5[0-9]{2})[0-9]{12})\b'

# Credentials
USERNAME_PASSWORD = r'(?i)(?:username|user|login|email)[\s:=]+([^\s:]+)[\s\n\r]*(?:password|pass|pwd)[\s:=]+([^\s\n\r]+)'
API_KEY_AWS = r'\b(?:AKIA|ASIA)[0-9A-Z]{16}\b'
API_KEY_GENERIC = r'\b[a-zA-Z0-9_-]{32,}\b'
JWT_TOKEN = r'\beyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*\b'

# Hashes
MD5 = r'\b[a-fA-F0-9]{32}\b'
SHA1 = r'\b[a-fA-F0-9]{40}\b'
SHA256 = r'\b[a-fA-F0-9]{64}\b'

# Communication
PGP_KEY = r'-----BEGIN PGP (?:PUBLIC|PRIVATE) KEY BLOCK-----'
PGP_FINGERPRINT = r'\b[0-9A-F]{40}\b'
JABBER = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\s|$)'
TELEGRAM = r'(?:@|t\.me/)[a-zA-Z0-9_]{5,32}'
WICKR = r'(?i)wickr(?:\s*:?\s*|me\s*:?\s*)([a-zA-Z0-9_-]{5,20})'

# CVE
CVE = r'\bCVE-\d{4}-\d{4,7}\b'
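
# A minimal concrete extractor sketch (illustrative only; the PR's real extractors live in
# other modules of this package). It shows how BaseExtractor, RegexPatterns, get_context()
# and calculate_risk_level() are meant to compose:
#
#     class EmailExtractor(BaseExtractor):
#         """Finds email addresses and wraps them in ExtractionResult objects."""
#
#         def extract(self, text: str, url: str = "") -> List[ExtractionResult]:
#             results = []
#             for m in re.finditer(RegexPatterns.EMAIL, text):
#                 confidence = 0.8  # bare regex hit, no further validation here
#                 results.append(ExtractionResult(
#                     category="pii",
#                     confidence=confidence,
#                     risk_level=self.calculate_risk_level("email", confidence),
#                     data={"type": "email", "value": m.group(0)},
#                     context=self.get_context(text, m.start(), m.end()),
#                     location=url,
#                 ))
#             self.results.extend(results)
#             return results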


class LuhnValidator:
"""Luhn algorithm for credit card validation"""

@staticmethod
def validate(number: str) -> bool:
"""
Validate credit card number using Luhn algorithm

Args:
number: Credit card number string

Returns:
True if valid, False otherwise
"""
try:
# Remove any spaces or dashes
number = number.replace(' ', '').replace('-', '')

if not number.isdigit():
return False

# Luhn algorithm
total = 0
reverse_digits = number[::-1]

for i, digit in enumerate(reverse_digits):
n = int(digit)
if i % 2 == 1:
n *= 2
if n > 9:
n -= 9
total += n

return total % 10 == 0
except (ValueError, AttributeError):
return False
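
# Quick check of the algorithm above (standard test numbers, not real cards):
#     LuhnValidator.validate("4111 1111 1111 1111")  -> True
#     LuhnValidator.validate("4111111111111112")     -> False
# An extractor could pair CREDIT_CARD regex hits with this check to raise confidence
# before labelling a match as a card number.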
