Skip to content

Commit 93ef0e5

Browse files
DarkWeb Extractor feature added
1 parent 2902e54 commit 93ef0e5

14 files changed

Lines changed: 3084 additions & 16 deletions

main.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from torbot.modules.updater import check_version
1313
from torbot.modules.info import execute_all
1414
from torbot.modules.linktree import LinkTree
15+
from torbot.modules.deep_extract import DeepExtractor
1516

1617

1718
def print_tor_ip_address(client: httpx.Client) -> None:
@@ -95,6 +96,38 @@ def run(arg_parser: argparse.ArgumentParser, version: str) -> None:
9596
tree = LinkTree(url=args.url, depth=args.depth, client=client)
9697
tree.load()
9798

99+
# Deep extraction if requested
100+
if args.deep_extract:
101+
logging.info("Starting deep content extraction...")
102+
deep_extractor = DeepExtractor()
103+
104+
# Extract content from each page in the tree
105+
pages_analyzed = 0
106+
for node_url in tree.nodes:
107+
try:
108+
logging.debug(f"Extracting from: {node_url}")
109+
response = client.get(node_url)
110+
if response.status_code == 200:
111+
deep_extractor.extract_all(response.text, node_url)
112+
pages_analyzed += 1
113+
except Exception as e:
114+
logging.warning(f"Could not extract from {node_url}: {str(e)}")
115+
116+
logging.info(f"Deep extraction complete. Analyzed {pages_analyzed} pages.")
117+
118+
# Print summary
119+
deep_extractor.print_summary()
120+
121+
# Export to JSON if requested
122+
if args.export_intel:
123+
logging.info(f"Exporting intelligence to {args.export_intel}...")
124+
deep_extractor.export_to_json(args.export_intel)
125+
126+
# Also create a text report
127+
text_report_path = args.export_intel.replace('.json', '_report.txt')
128+
deep_extractor.export_to_text(text_report_path)
129+
logging.info(f"Text report saved to {text_report_path}")
130+
98131
# save data if desired
99132
if args.save == "tree":
100133
tree.save()
@@ -158,6 +191,17 @@ def set_arguments() -> argparse.ArgumentParser:
158191
action="store_true",
159192
help="Executes HTTP requests without using SOCKS5 proxy",
160193
)
194+
parser.add_argument(
195+
"--deep-extract",
196+
action="store_true",
197+
help="Enable deep content extraction mode for OSINT intelligence gathering",
198+
)
199+
parser.add_argument(
200+
"--export-intel",
201+
type=str,
202+
metavar="FILENAME",
203+
help="Export extracted intelligence to JSON file (use with --deep-extract)",
204+
)
161205

162206
return parser
163207

requirements.txt

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -256,19 +256,4 @@ urllib3==1.26.18 ; python_version >= "3.9" and python_full_version <= "3.11.4" \
256256
validators==0.20.0 ; python_version >= "3.9" and python_full_version <= "3.11.4" \
257257
--hash=sha256:24148ce4e64100a2d5e267233e23e7afeb55316b47d30faae7eb6e7292bc226a
258258
yattag==1.15.1 ; python_version >= "3.9" and python_full_version <= "3.11.4" \
259-
--hash=sha256:960fa54be1229d96f43178133e0b195c003391fdc49ecdb6b69b7374db6be416
260-
261-
numpy~=1.24.4
262-
beautifulsoup4~=4.11.1
263-
sklearn~=0.0
264-
scikit-learn~=1.3.0
265-
httpx[socks]~=0.25.0
266-
yattag~=1.15.1
267-
termcolor~=1.1.0
268-
python-dotenv~=0.20.0
269-
Unipath~=1.1
270-
validators~=0.20.0
271-
phonenumbers~=8.13.22
272-
tabulate~=0.9.0
273-
treelib~=1.7.0
274-
toml~=0.10.2
259+
--hash=sha256:960fa54be1229d96f43178133e0b195c003391fdc49ecdb6b69b7374db6be416
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
"""
2+
Deep Web Content Extraction Module
3+
4+
This module provides comprehensive content extraction and intelligence gathering
5+
capabilities for dark web OSINT investigations.
6+
"""
7+
8+
from .orchestrator import DeepExtractor
9+
from .base import BaseExtractor, ExtractionResult
10+
11+
__all__ = ['DeepExtractor', 'BaseExtractor', 'ExtractionResult']
12+
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
"""
2+
Base classes and utilities for deep content extraction
3+
"""
4+
5+
import re
6+
from typing import List, Dict, Any, Optional
7+
from dataclasses import dataclass, field
8+
from abc import ABC, abstractmethod
9+
from datetime import datetime
10+
11+
12+
@dataclass
class ExtractionResult:
    """A single piece of extracted intelligence.

    Bundles the extracted payload together with metadata describing what
    kind of finding it is, how confident the extractor was, and where in
    the crawl it was observed.
    """

    category: str  # type of extraction (credentials, pii, crypto, etc.)
    confidence: float  # confidence score (0.0 to 1.0)
    risk_level: str  # low, medium, high, critical
    data: Dict[str, Any]  # the actual extracted data
    context: Optional[str] = None  # surrounding context
    location: Optional[str] = None  # location in page (URL, line number, etc.)
    timestamp: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict view of this result.

        All fields are passed through unchanged except ``timestamp``,
        which is rendered as an ISO-8601 string.
        """
        serialized = {
            name: getattr(self, name)
            for name in ('category', 'confidence', 'risk_level',
                         'data', 'context', 'location')
        }
        serialized['timestamp'] = self.timestamp.isoformat()
        return serialized
36+
37+
class BaseExtractor(ABC):
    """Abstract parent for all content extractors.

    Subclasses implement :meth:`extract`; this base supplies shared
    helpers for pulling context snippets out of the page text and for
    scoring the risk of a finding.
    """

    def __init__(self):
        # Findings accumulated by extract() calls on this instance.
        self.results: List[ExtractionResult] = []

    @abstractmethod
    def extract(self, text: str, url: str = "") -> List[ExtractionResult]:
        """Analyze *text* for intelligence.

        Args:
            text: The text content to analyze
            url: The source URL (optional)

        Returns:
            List of ExtractionResult objects
        """
        pass

    def get_context(self, text: str, match_start: int, match_end: int,
                    context_chars: int = 100) -> str:
        """Return the text surrounding a match, whitespace-normalized.

        Args:
            text: Full text content
            match_start: Start position of match
            match_end: End position of match
            context_chars: Number of characters to include on each side

        Returns:
            Context string with newlines and runs of whitespace collapsed
            to single spaces, stripped at both ends.
        """
        lo = max(0, match_start - context_chars)
        hi = min(len(text), match_end + context_chars)
        snippet = text[lo:hi].replace('\n', ' ').replace('\r', ' ')
        return re.sub(r'\s+', ' ', snippet).strip()

    def calculate_risk_level(self, data_type: str, confidence: float) -> str:
        """Map a data type and confidence score to a risk bucket.

        Buckets are checked from most to least severe: a finding only
        lands in a bucket when its type is listed there AND its
        confidence strictly exceeds that bucket's threshold; anything
        else falls through to 'low'.

        Args:
            data_type: Type of sensitive data found
            confidence: Confidence score

        Returns:
            Risk level string: 'critical', 'high', 'medium', or 'low'.
        """
        severity_table = (
            ('critical', 0.7,
             ('password', 'ssn', 'credit_card', 'api_key', 'private_key')),
            ('high', 0.6,
             ('email', 'phone', 'bitcoin', 'credential_dump')),
            ('medium', 0.5,
             ('onion_link', 'ip_address', 'hash')),
        )
        for level, threshold, types in severity_table:
            if data_type in types and confidence > threshold:
                return level
        return 'low'
104+
105+
106+
class RegexPatterns:
    """Common regex patterns for extraction.

    Patterns are plain class attributes so extractors can reference them
    without instantiating this class. NOTE(review): several patterns
    overlap by construction (SHA1 and PGP_FINGERPRINT match the same
    40-hex-char strings; JABBER matches anything EMAIL does) — callers
    must disambiguate by context.
    """

    # Email patterns
    # Fixed: the TLD class was "[A-Z|a-z]", which wrongly treated '|' as a
    # valid top-level-domain character (e.g. matched "a@b.|x").
    EMAIL = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

    # Cryptocurrency addresses
    BITCOIN = r'\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b'
    ETHEREUM = r'\b0x[a-fA-F0-9]{40}\b'
    MONERO = r'\b4[0-9AB][1-9A-HJ-NP-Za-km-z]{93}\b'
    LITECOIN = r'\b[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}\b'

    # Onion links (v2 = 16-char base32, v3 = 56-char base32)
    ONION_V2 = r'\b[a-z2-7]{16}\.onion\b'
    ONION_V3 = r'\b[a-z2-7]{56}\.onion\b'

    # Network indicators
    IPV4 = r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
    IPV6 = r'\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b'
    DOMAIN = r'\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}\b'

    # PII
    PHONE = r'\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b'
    SSN = r'\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b'
    CREDIT_CARD = r'\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|6(?:011|5[0-9]{2})[0-9]{12})\b'

    # Credentials
    USERNAME_PASSWORD = r'(?i)(?:username|user|login|email)[\s:=]+([^\s:]+)[\s\n\r]*(?:password|pass|pwd)[\s:=]+([^\s\n\r]+)'
    API_KEY_AWS = r'\b(?:AKIA|ASIA)[0-9A-Z]{16}\b'
    API_KEY_GENERIC = r'\b[a-zA-Z0-9_-]{32,}\b'
    JWT_TOKEN = r'\beyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*\b'

    # Hashes
    MD5 = r'\b[a-fA-F0-9]{32}\b'
    SHA1 = r'\b[a-fA-F0-9]{40}\b'
    SHA256 = r'\b[a-fA-F0-9]{64}\b'

    # Communication
    PGP_KEY = r'-----BEGIN PGP (?:PUBLIC|PRIVATE) KEY BLOCK-----'
    PGP_FINGERPRINT = r'\b[0-9A-F]{40}\b'
    JABBER = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\s|$)'
    TELEGRAM = r'(?:@|t\.me/)[a-zA-Z0-9_]{5,32}'
    WICKR = r'(?i)wickr(?:\s*:?\s*|me\s*:?\s*)([a-zA-Z0-9_-]{5,20})'

    # CVE identifiers
    CVE = r'\bCVE-\d{4}-\d{4,7}\b'
152+
153+
154+
class LuhnValidator:
    """Luhn algorithm for credit card validation"""

    @staticmethod
    def validate(number: str) -> bool:
        """
        Validate a credit card number using the Luhn checksum.

        Spaces and dashes are stripped before checking; any other
        non-digit content — or a non-string argument — returns False
        rather than raising.

        Args:
            number: Credit card number string

        Returns:
            True if the number passes the Luhn check, False otherwise
        """
        # Fixed: the original wrapped the whole body in a bare `except:`,
        # which also swallows KeyboardInterrupt/SystemExit. Only the
        # .replace() calls can actually fail (non-string input), so catch
        # exactly that.
        try:
            # Accept common formatting: remove spaces and dashes.
            digits = number.replace(' ', '').replace('-', '')
        except AttributeError:
            return False

        # Rejects empty strings and any remaining non-digit characters.
        if not digits.isdigit():
            return False

        # Luhn algorithm: double every second digit from the right,
        # subtract 9 when the doubled value exceeds 9, then check the
        # total modulo 10.
        total = 0
        for i, ch in enumerate(digits[::-1]):
            n = int(ch)
            if i % 2 == 1:
                n *= 2
                if n > 9:
                    n -= 9
            total += n

        return total % 10 == 0
190+

0 commit comments

Comments
 (0)