app.analyzer.ioc
IOC extraction and prompt-building helpers for the forensic analyzer.
Extracts Indicators of Compromise (URLs, IPs, domains, hashes, emails, file paths, filenames, suspicious tool keywords) from investigation context text, and formats them into prompt sections for AI analysis.
Attributes:
- LOGGER: Module-level logger instance.
"""IOC extraction and prompt-building helpers for the forensic analyzer.

Extracts Indicators of Compromise (URLs, IPs, domains, hashes, emails,
file paths, filenames, suspicious tool keywords) from investigation context
text, and formats them into prompt sections for AI analysis.

Attributes:
    LOGGER: Module-level logger instance.
"""

from __future__ import annotations

import logging

from .constants import (
    DOMAIN_EXCLUDED_SUFFIXES,
    IOC_DOMAIN_RE,
    IOC_EMAIL_RE,
    IOC_FILENAME_RE,
    IOC_HASH_RE,
    IOC_IPV4_RE,
    IOC_URL_RE,
    KNOWN_MALICIOUS_TOOL_KEYWORDS,
    WINDOWS_PATH_RE,
)
from .utils import (
    extract_url_host,
    stringify_value,
    truncate_for_prompt,
    unique_preserve_order,
)

__all__ = [
    "extract_ioc_targets",
    "format_ioc_targets",
    "build_priority_directives",
    "build_artifact_final_context_reminder",
    "extract_tool_keywords",
]

# The module docstring documents this attribute; define it so the
# documented name actually exists on the module.
LOGGER = logging.getLogger(__name__)

# Maximum number of values rendered per IOC category by format_ioc_targets.
_MAX_VALUES_PER_CATEGORY = 20

# ``limit`` handed to truncate_for_prompt for each field of the final reminder.
_REMINDER_FIELD_LIMIT = 1200


def extract_tool_keywords(text: str) -> list[str]:
    """Return the known malicious tool keywords that occur in ``text``.

    The text is lower-cased once and each entry of
    ``KNOWN_MALICIOUS_TOOL_KEYWORDS`` is tested as a plain substring of it.

    Args:
        text: Free-text string to scan for tool keywords.

    Returns:
        The matching keywords, passed through ``unique_preserve_order``.
        Matches are collected in the iteration order of
        ``KNOWN_MALICIOUS_TOOL_KEYWORDS``, not in the order in which they
        appear in ``text``.
    """
    haystack = text.lower()
    # NOTE(review): keywords are compared as-is against lower-cased text, so
    # this presumably relies on the keyword collection being lower-case.
    matched = [
        keyword for keyword in KNOWN_MALICIOUS_TOOL_KEYWORDS if keyword in haystack
    ]
    return unique_preserve_order(matched)


def extract_ioc_targets(investigation_context: str) -> dict[str, list[str]]:
    """Extract Indicators of Compromise from investigation context text.

    Runs the module's IOC regexes (URL, IPv4, hash, email, Windows path,
    filename, domain) over the stringified context and adds the known
    malicious tool keywords found by ``extract_tool_keywords``.

    Args:
        investigation_context: Free-text investigation context string.

    Returns:
        A dict mapping IOC category names ("URLs", "IPv4", "Domains",
        "Hashes", "Emails", "FilePaths", "FileNames", "SuspiciousTools")
        to their extracted values. Categories with no values are omitted,
        so the dict is empty when nothing is found or the context
        stringifies to an empty value.
    """
    text = stringify_value(investigation_context)
    if not text:
        return {}

    def _scan(pattern) -> list[str]:
        # Every category is gathered the same way: all matches, deduplicated.
        return unique_preserve_order(pattern.findall(text))

    urls = _scan(IOC_URL_RE)
    ips = _scan(IOC_IPV4_RE)
    hashes = _scan(IOC_HASH_RE)
    emails = _scan(IOC_EMAIL_RE)
    windows_paths = _scan(WINDOWS_PATH_RE)
    file_names = _scan(IOC_FILENAME_RE)
    tools = extract_tool_keywords(text)

    known_hosts = {extract_url_host(url) for url in urls}
    known_file_names = {name.lower() for name in file_names}
    excluded_suffixes = tuple(DOMAIN_EXCLUDED_SUFFIXES)

    def _is_standalone_domain(candidate: str) -> bool:
        # Skip candidates already reported as a URL host or as a filename,
        # and anything ending in an excluded suffix.
        folded = candidate.lower()
        if folded in known_hosts or folded in known_file_names:
            return False
        return not folded.endswith(excluded_suffixes)

    domains = unique_preserve_order(
        [
            candidate
            for candidate in _scan(IOC_DOMAIN_RE)
            if _is_standalone_domain(candidate)
        ]
    )

    # Insertion order here is the category order callers see when iterating.
    categorized = (
        ("URLs", urls),
        ("IPv4", ips),
        ("Domains", domains),
        ("Hashes", hashes),
        ("Emails", emails),
        ("FilePaths", windows_paths),
        ("FileNames", file_names),
        ("SuspiciousTools", tools),
    )
    return {label: values for label, values in categorized if values}


def format_ioc_targets(investigation_context: str) -> str:
    """Format extracted IOC targets as a human-readable bullet list.

    Args:
        investigation_context: Free-text investigation context string.

    Returns:
        One ``- Category: v1, v2, ...`` line per IOC category, joined with
        newlines. At most 20 values are shown per category; longer lists
        get a `` ... [truncated]`` suffix. If no IOCs are extracted, a fixed
        sentence saying so is returned instead.
    """
    extracted = extract_ioc_targets(investigation_context)
    if not extracted:
        return "No explicit IOC patterns were extracted from the investigation context."

    def _bullet(category: str, values: list[str]) -> str:
        shown = ", ".join(values[:_MAX_VALUES_PER_CATEGORY])
        marker = " ... [truncated]" if len(values) > _MAX_VALUES_PER_CATEGORY else ""
        return f"- {category}: {shown}{marker}"

    return "\n".join(
        _bullet(category, values) for category, values in extracted.items()
    )


def build_priority_directives(investigation_context: str) -> str:
    """Build numbered priority directives for the AI analysis prompt.

    The second directive depends on whether ``extract_ioc_targets`` finds
    anything in the context; the other three are fixed text.

    Args:
        investigation_context: Free-text investigation context string.

    Returns:
        Four numbered directives joined with newlines.
    """
    if extract_ioc_targets(investigation_context):
        ioc_directive = (
            "2. For each IOC listed below, explicitly classify it as Observed, "
            "Not Observed, or Not Assessable in this artifact."
        )
    else:
        ioc_directive = (
            "2. No explicit IOC was extracted; still prioritize user-stated "
            "hypotheses and suspicious themes."
        )

    directives = (
        "1. Treat the user investigation context as highest priority and address it before generic hunting.",
        ioc_directive,
        "3. Always run default DFIR checks: privilege escalation, credential access tooling (including Mimikatz-like activity), persistence, defense evasion, lateral movement, and potential exfiltration.",
        "4. Focus on evidence that improves triage or containment decisions; keep baseline/statistical context secondary.",
    )
    return "\n".join(directives)


def build_artifact_final_context_reminder(
    artifact_key: str,
    artifact_name: str,
    investigation_context: str,
) -> str:
    """Build a short reminder section intended for the end of a prompt.

    The section restates the artifact identity, the investigation context,
    the formatted IOC targets and the default DFIR checks. The context and
    the IOC list are each passed through ``truncate_for_prompt`` with
    ``limit=1200``.

    Args:
        artifact_key: Unique identifier for the artifact.
        artifact_name: Human-readable artifact name.
        investigation_context: The user's investigation context text.

    Returns:
        A multi-line string that starts with a Markdown heading followed by
        one bullet per reminder item.
    """
    raw_context = stringify_value(investigation_context)
    context_text = (
        truncate_for_prompt(raw_context, limit=_REMINDER_FIELD_LIMIT)
        if raw_context
        else "No investigation context provided."
    )
    # IOCs are extracted from the original context, not the truncated copy.
    ioc_targets = truncate_for_prompt(
        format_ioc_targets(investigation_context),
        limit=_REMINDER_FIELD_LIMIT,
    )

    return "\n".join(
        (
            "## Final Context Reminder (Do Not Ignore)",
            f"- Artifact key: {artifact_key}",
            f"- Artifact name: {artifact_name}",
            f"- Investigation context (mandatory): {context_text}",
            f"- IOC targets (mandatory follow-through): {ioc_targets}",
            "- Always run default DFIR checks: privilege escalation, credential-access/Mimikatz-like behavior, malicious program execution, persistence/evasion/lateral movement/exfiltration.",
            "- If evidence is insufficient, mark IOC or DFIR check as Not Assessable.",
        )
    )
def extract_ioc_targets(investigation_context: str) -> dict[str, list[str]]:
    """Collect Indicators of Compromise found in the investigation context.

    The context is stringified, then scanned with the module's IOC regexes
    (URL, IPv4, hash, email, Windows path, filename, domain) and with
    ``extract_tool_keywords``. Domain candidates are dropped when they are
    already covered by a URL host or a filename match, or when they end in
    one of ``DOMAIN_EXCLUDED_SUFFIXES``.

    Args:
        investigation_context: Free-text investigation context string.

    Returns:
        A dict mapping IOC category names ("URLs", "IPv4", "Domains",
        "Hashes", "Emails", "FilePaths", "FileNames", "SuspiciousTools")
        to their extracted values. Empty categories are left out, so the
        result is an empty dict when nothing is found or the context
        stringifies to an empty value.
    """
    text = stringify_value(investigation_context)
    if not text:
        return {}

    def _scan(pattern) -> list[str]:
        # Every category is gathered the same way: all matches, deduplicated.
        return unique_preserve_order(pattern.findall(text))

    urls = _scan(IOC_URL_RE)
    ips = _scan(IOC_IPV4_RE)
    hashes = _scan(IOC_HASH_RE)
    emails = _scan(IOC_EMAIL_RE)
    windows_paths = _scan(WINDOWS_PATH_RE)
    file_names = _scan(IOC_FILENAME_RE)
    tools = extract_tool_keywords(text)

    known_hosts = {extract_url_host(url) for url in urls}
    known_file_names = {name.lower() for name in file_names}
    excluded_suffixes = tuple(DOMAIN_EXCLUDED_SUFFIXES)

    def _is_standalone_domain(candidate: str) -> bool:
        # Skip candidates already reported as a URL host or as a filename,
        # and anything ending in an excluded suffix.
        folded = candidate.lower()
        if folded in known_hosts or folded in known_file_names:
            return False
        return not folded.endswith(excluded_suffixes)

    domains = unique_preserve_order(
        [
            candidate
            for candidate in _scan(IOC_DOMAIN_RE)
            if _is_standalone_domain(candidate)
        ]
    )

    # Insertion order here is the category order callers see when iterating.
    categorized = (
        ("URLs", urls),
        ("IPv4", ips),
        ("Domains", domains),
        ("Hashes", hashes),
        ("Emails", emails),
        ("FilePaths", windows_paths),
        ("FileNames", file_names),
        ("SuspiciousTools", tools),
    )
    return {label: values for label, values in categorized if values}
Extract Indicators of Compromise from investigation context text.
Uses regex patterns to identify URLs, IPv4 addresses, domains, hashes (MD5/SHA1/SHA256), email addresses, Windows file paths, executable filenames, and known malicious tool keywords.
Arguments:
- investigation_context: Free-text investigation context string.
Returns:
A dict mapping IOC category names to deduplicated lists of extracted values. Returns an empty dict if no IOCs are found.
def format_ioc_targets(investigation_context: str) -> str:
    """Render the IOCs extracted from the context as a bullet list.

    Args:
        investigation_context: Free-text investigation context string.

    Returns:
        One ``- Category: v1, v2, ...`` line per IOC category, joined with
        newlines. At most 20 values are shown per category; longer lists
        get a `` ... [truncated]`` suffix. If ``extract_ioc_targets`` finds
        nothing, a fixed sentence saying so is returned instead.
    """
    extracted = extract_ioc_targets(investigation_context)
    if not extracted:
        return "No explicit IOC patterns were extracted from the investigation context."

    max_values = 20  # per-category display cap

    def _bullet(category: str, values: list[str]) -> str:
        shown = ", ".join(values[:max_values])
        marker = " ... [truncated]" if len(values) > max_values else ""
        return f"- {category}: {shown}{marker}"

    return "\n".join(
        _bullet(category, values) for category, values in extracted.items()
    )
Format extracted IOC targets as a human-readable bullet list.
Arguments:
- investigation_context: Free-text investigation context string.
Returns:
A multi-line string with one bullet per IOC category (up to 20 values each), or a message indicating no IOCs were found.
def build_priority_directives(investigation_context: str) -> str:
    """Assemble the numbered priority directives for the analysis prompt.

    Directive 2 depends on whether ``extract_ioc_targets`` finds anything in
    the context: it either asks for a per-IOC classification or states that
    no IOC was extracted. Directives 1, 3 and 4 are fixed text.

    Args:
        investigation_context: Free-text investigation context string.

    Returns:
        Four numbered directives joined with newlines.
    """
    if extract_ioc_targets(investigation_context):
        ioc_directive = (
            "2. For each IOC listed below, explicitly classify it as Observed, "
            "Not Observed, or Not Assessable in this artifact."
        )
    else:
        ioc_directive = (
            "2. No explicit IOC was extracted; still prioritize user-stated "
            "hypotheses and suspicious themes."
        )

    directives = (
        "1. Treat the user investigation context as highest priority and address it before generic hunting.",
        ioc_directive,
        "3. Always run default DFIR checks: privilege escalation, credential access tooling (including Mimikatz-like activity), persistence, defense evasion, lateral movement, and potential exfiltration.",
        "4. Focus on evidence that improves triage or containment decisions; keep baseline/statistical context secondary.",
    )
    return "\n".join(directives)
Build numbered priority directives for the AI analysis prompt.
Generates a set of directives that instruct the AI to prioritize the user's investigation context, check IOCs, and run standard DFIR checks.
Arguments:
- investigation_context: Free-text investigation context string.
Returns:
A multi-line numbered list of priority directives.
def build_artifact_final_context_reminder(
    artifact_key: str,
    artifact_name: str,
    investigation_context: str,
) -> str:
    """Build a short reminder section intended for the end of a prompt.

    The section restates the artifact identity, the investigation context,
    the formatted IOC targets and the default DFIR checks. The context and
    the IOC list are each passed through ``truncate_for_prompt`` with
    ``limit=1200``.

    Args:
        artifact_key: Unique identifier for the artifact.
        artifact_name: Human-readable artifact name.
        investigation_context: The user's investigation context text.

    Returns:
        A multi-line string that starts with a Markdown heading followed by
        one bullet per reminder item.
    """
    field_limit = 1200

    raw_context = stringify_value(investigation_context)
    context_text = (
        truncate_for_prompt(raw_context, limit=field_limit)
        if raw_context
        else "No investigation context provided."
    )
    # IOCs are extracted from the original context, not the truncated copy.
    ioc_targets = truncate_for_prompt(
        format_ioc_targets(investigation_context),
        limit=field_limit,
    )

    return "\n".join(
        (
            "## Final Context Reminder (Do Not Ignore)",
            f"- Artifact key: {artifact_key}",
            f"- Artifact name: {artifact_name}",
            f"- Investigation context (mandatory): {context_text}",
            f"- IOC targets (mandatory follow-through): {ioc_targets}",
            "- Always run default DFIR checks: privilege escalation, credential-access/Mimikatz-like behavior, malicious program execution, persistence/evasion/lateral movement/exfiltration.",
            "- If evidence is insufficient, mark IOC or DFIR check as Not Assessable.",
        )
    )
Build a short end-of-prompt reminder that survives left-side truncation.
Places critical context (artifact identity, investigation focus, IOC targets, DFIR checks) at the very end of the prompt so that models with left-side attention decay still see the most important instructions.
Arguments:
- artifact_key: Unique identifier for the artifact.
- artifact_name: Human-readable artifact name.
- investigation_context: The user's investigation context text.
Returns:
A multi-line reminder section string starting with a Markdown heading.
def extract_tool_keywords(text: str) -> list[str]:
    """Return the known malicious tool keywords that occur in ``text``.

    The text is lower-cased once and each entry of
    ``KNOWN_MALICIOUS_TOOL_KEYWORDS`` is tested as a plain substring of it.

    Args:
        text: Free-text string to scan for tool keywords.

    Returns:
        The matching keywords, passed through ``unique_preserve_order``.
        Matches are collected in the iteration order of
        ``KNOWN_MALICIOUS_TOOL_KEYWORDS``, not in the order in which they
        appear in ``text``.
    """
    haystack = text.lower()
    # NOTE(review): keywords are compared as-is against lower-cased text, so
    # this presumably relies on the keyword collection being lower-case.
    matched = [
        keyword for keyword in KNOWN_MALICIOUS_TOOL_KEYWORDS if keyword in haystack
    ]
    return unique_preserve_order(matched)
Extract known malicious tool keyword matches from text.
Arguments:
- text: Free-text string to scan for tool keywords.
Returns:
A deduplicated list of matched tool keyword strings, preserving the order of first occurrence.