|
| 1 | +""" |
| 2 | +Base classes and utilities for deep content extraction |
| 3 | +""" |
| 4 | + |
| 5 | +import re |
| 6 | +from typing import List, Dict, Any, Optional |
| 7 | +from dataclasses import dataclass, field |
| 8 | +from abc import ABC, abstractmethod |
| 9 | +from datetime import datetime |
| 10 | + |
| 11 | + |
@dataclass
class ExtractionResult:
    """One finding produced by an extractor.

    ``timestamp`` defaults to ``datetime.now()`` evaluated when the
    instance is created; none of the fields are validated here.
    """

    # Type of extraction (credentials, pii, crypto, etc.)
    category: str
    # Confidence score (0.0 to 1.0)
    confidence: float
    # low, medium, high, critical
    risk_level: str
    # The actual extracted data
    data: Dict[str, Any]
    # Surrounding context
    context: Optional[str] = None
    # Location in page (URL, line number, etc.)
    location: Optional[str] = None
    timestamp: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a plain dict.

        Every field is passed through unchanged (``data`` is the same
        object, not a copy) except ``timestamp``, which is rendered with
        ``isoformat()``.
        """
        passthrough = (
            'category',
            'confidence',
            'risk_level',
            'data',
            'context',
            'location',
        )
        payload: Dict[str, Any] = {
            name: getattr(self, name) for name in passthrough
        }
        # datetime objects are not JSON-serializable, so emit a string.
        payload['timestamp'] = self.timestamp.isoformat()
        return payload
| 35 | + |
| 36 | + |
class BaseExtractor(ABC):
    """Abstract parent for content extractors.

    Subclasses implement :meth:`extract`. The helpers defined here build a
    context snippet around a match and map a data type plus confidence to
    a risk label.
    """

    def __init__(self):
        # Per-instance result list; nothing in this class appends to it.
        self.results: List[ExtractionResult] = []

    @abstractmethod
    def extract(self, text: str, url: str = "") -> List[ExtractionResult]:
        """
        Analyze text content and report findings

        Args:
            text: The text content to analyze
            url: The source URL (optional)

        Returns:
            List of ExtractionResult objects
        """

    def get_context(self, text: str, match_start: int, match_end: int,
                    context_chars: int = 100) -> str:
        """
        Return a single-line snippet of text around a match

        Args:
            text: Full text content
            match_start: Start position of match
            match_end: End position of match
            context_chars: Number of characters to include on each side

        Returns:
            The slice from ``match_start - context_chars`` to
            ``match_end + context_chars`` (clamped to the text bounds),
            with every whitespace run collapsed to one space and the
            ends stripped
        """
        window_start = match_start - context_chars
        if window_start < 0:
            window_start = 0
        window_end = min(len(text), match_end + context_chars)

        snippet = text[window_start:window_end]

        # Flatten line breaks, then squeeze whitespace runs to one space.
        flattened = snippet.replace('\n', ' ').replace('\r', ' ')
        return re.sub(r'\s+', ' ', flattened).strip()

    def calculate_risk_level(self, data_type: str, confidence: float) -> str:
        """
        Map a data type and confidence score to a risk label

        Each tier has its own confidence threshold. A known type whose
        confidence does not exceed its tier's threshold is rated 'low';
        it does not fall back to the next tier down. Unknown types are
        always 'low'.

        Args:
            data_type: Type of sensitive data found
            confidence: Confidence score

        Returns:
            One of 'critical', 'high', 'medium' or 'low'
        """
        # (label, confidence must be strictly greater than, member types)
        tiers = (
            ('critical', 0.7,
             ('password', 'ssn', 'credit_card', 'api_key', 'private_key')),
            ('high', 0.6,
             ('email', 'phone', 'bitcoin', 'credential_dump')),
            ('medium', 0.5,
             ('onion_link', 'ip_address', 'hash')),
        )

        for label, threshold, members in tiers:
            if data_type in members and confidence > threshold:
                return label
        return 'low'
| 104 | + |
| 105 | + |
class RegexPatterns:
    """Common regex patterns for extraction.

    Each attribute is an uncompiled pattern string intended for the
    ``re`` module. Patterns that start with ``(?i)`` carry their own
    case-insensitive flag.
    """

    # Email patterns
    # The TLD class is [A-Za-z]; a '|' inside a character class is a
    # literal pipe, not alternation, so it must not appear there.
    EMAIL = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

    # Cryptocurrency addresses
    BITCOIN = r'\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b'
    ETHEREUM = r'\b0x[a-fA-F0-9]{40}\b'
    MONERO = r'\b4[0-9AB][1-9A-HJ-NP-Za-km-z]{93}\b'
    LITECOIN = r'\b[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}\b'

    # Onion links
    ONION_V2 = r'\b[a-z2-7]{16}\.onion\b'
    ONION_V3 = r'\b[a-z2-7]{56}\.onion\b'

    # Network indicators
    IPV4 = r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
    IPV6 = r'\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b'
    DOMAIN = r'\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}\b'

    # PII
    PHONE = r'\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b'
    SSN = r'\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b'
    CREDIT_CARD = r'\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|6(?:011|5[0-9]{2})[0-9]{12})\b'

    # Credentials
    USERNAME_PASSWORD = r'(?i)(?:username|user|login|email)[\s:=]+([^\s:]+)[\s\n\r]*(?:password|pass|pwd)[\s:=]+([^\s\n\r]+)'
    API_KEY_AWS = r'\b(?:AKIA|ASIA)[0-9A-Z]{16}\b'
    API_KEY_GENERIC = r'\b[a-zA-Z0-9_-]{32,}\b'
    JWT_TOKEN = r'\beyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*\b'

    # Hashes
    MD5 = r'\b[a-fA-F0-9]{32}\b'
    SHA1 = r'\b[a-fA-F0-9]{40}\b'
    SHA256 = r'\b[a-fA-F0-9]{64}\b'

    # Communication
    PGP_KEY = r'-----BEGIN PGP (?:PUBLIC|PRIVATE) KEY BLOCK-----'
    PGP_FINGERPRINT = r'\b[0-9A-F]{40}\b'
    # Address must be followed by whitespace or end of input; a lookahead
    # is used so that trailing whitespace is not part of the match.
    JABBER = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?=\s|$)'
    TELEGRAM = r'(?:@|t\.me/)[a-zA-Z0-9_]{5,32}'
    WICKR = r'(?i)wickr(?:\s*:?\s*|me\s*:?\s*)([a-zA-Z0-9_-]{5,20})'

    # CVE
    CVE = r'\bCVE-\d{4}-\d{4,7}\b'
| 152 | + |
| 153 | + |
class LuhnValidator:
    """Luhn algorithm for credit card validation"""

    @staticmethod
    def validate(number: str) -> bool:
        """
        Validate credit card number using Luhn algorithm

        Spaces and dashes are ignored. Anything that is not a string, is
        empty after stripping separators, or contains a non-decimal
        character yields False instead of raising.

        Args:
            number: Credit card number string

        Returns:
            True if the Luhn checksum is a multiple of 10, False otherwise
        """
        # Non-string input is reported as invalid rather than raising.
        if not isinstance(number, str):
            return False

        # Remove any spaces or dashes
        digits = number.replace(' ', '').replace('-', '')

        # isdecimal() accepts exactly the characters int() can convert;
        # isdigit() would also admit characters such as superscripts,
        # which int() rejects. The empty string is rejected here as well.
        if not digits.isdecimal():
            return False

        # Walk right to left, doubling every second digit and folding
        # two-digit products back to one digit by subtracting 9.
        total = 0
        for position, char in enumerate(reversed(digits)):
            value = int(char)
            if position % 2 == 1:
                value *= 2
                if value > 9:
                    value -= 9
            total += value

        return total % 10 == 0
| 190 | + |
0 commit comments