app.analyzer.ioc

IOC extraction and prompt-building helpers for the forensic analyzer.

Extracts Indicators of Compromise (URLs, IPs, domains, hashes, emails, file paths, filenames, suspicious tool keywords) from investigation context text, and formats them into prompt sections for AI analysis.

Attributes:

LOGGER: Module-level logger instance.

View Source

  1"""IOC extraction and prompt-building helpers for the forensic analyzer.
  2
  3Extracts Indicators of Compromise (URLs, IPs, domains, hashes, emails,
  4file paths, filenames, suspicious tool keywords) from investigation context
  5text, and formats them into prompt sections for AI analysis.
  6
  7Attributes:
  8    LOGGER: Module-level logger instance.
  9"""
 10
 11from __future__ import annotations
 12
 13from .constants import (
 14    DOMAIN_EXCLUDED_SUFFIXES,
 15    IOC_DOMAIN_RE,
 16    IOC_EMAIL_RE,
 17    IOC_FILENAME_RE,
 18    IOC_HASH_RE,
 19    IOC_IPV4_RE,
 20    IOC_URL_RE,
 21    KNOWN_MALICIOUS_TOOL_KEYWORDS,
 22    WINDOWS_PATH_RE,
 23)
 24from .utils import (
 25    extract_url_host,
 26    stringify_value,
 27    truncate_for_prompt,
 28    unique_preserve_order,
 29)
 30
 31__all__ = [
 32    "extract_ioc_targets",
 33    "format_ioc_targets",
 34    "build_priority_directives",
 35    "build_artifact_final_context_reminder",
 36    "extract_tool_keywords",
 37]
 38
 39
 40def extract_tool_keywords(text: str) -> list[str]:
 41    """Extract known malicious tool keyword matches from text.
 42
 43    Args:
 44        text: Free-text string to scan for tool keywords.
 45
 46    Returns:
 47        A deduplicated list of matched tool keyword strings, preserving
 48        the order of first occurrence.
 49    """
 50    lowered = text.lower()
 51    hits: list[str] = []
 52    for keyword in KNOWN_MALICIOUS_TOOL_KEYWORDS:
 53        if keyword in lowered:
 54            hits.append(keyword)
 55    return unique_preserve_order(hits)
 56
 57
 58def extract_ioc_targets(investigation_context: str) -> dict[str, list[str]]:
 59    """Extract Indicators of Compromise from investigation context text.
 60
 61    Uses regex patterns to identify URLs, IPv4 addresses, domains,
 62    hashes (MD5/SHA1/SHA256), email addresses, Windows file paths,
 63    executable filenames, and known malicious tool keywords.
 64
 65    Args:
 66        investigation_context: Free-text investigation context string.
 67
 68    Returns:
 69        A dict mapping IOC category names to deduplicated lists of
 70        extracted values.  Returns an empty dict if no IOCs are found.
 71    """
 72    text = stringify_value(investigation_context)
 73    if not text:
 74        return {}
 75
 76    urls = unique_preserve_order(IOC_URL_RE.findall(text))
 77    ips = unique_preserve_order(IOC_IPV4_RE.findall(text))
 78    hashes = unique_preserve_order(IOC_HASH_RE.findall(text))
 79    emails = unique_preserve_order(IOC_EMAIL_RE.findall(text))
 80    windows_paths = unique_preserve_order(WINDOWS_PATH_RE.findall(text))
 81    file_names = unique_preserve_order(IOC_FILENAME_RE.findall(text))
 82    file_names_lower = {value.lower() for value in file_names}
 83    tools = extract_tool_keywords(text)
 84
 85    domain_candidates = unique_preserve_order(IOC_DOMAIN_RE.findall(text))
 86    domains: list[str] = []
 87    url_hosts = {extract_url_host(url) for url in urls}
 88    for domain in domain_candidates:
 89        lowered = domain.lower()
 90        if lowered in url_hosts:
 91            continue
 92        if lowered in file_names_lower:
 93            continue
 94        if any(lowered.endswith(suffix) for suffix in DOMAIN_EXCLUDED_SUFFIXES):
 95            continue
 96        domains.append(domain)
 97    domains = unique_preserve_order(domains)
 98
 99    iocs: dict[str, list[str]] = {}
100    if urls:
101        iocs["URLs"] = urls
102    if ips:
103        iocs["IPv4"] = ips
104    if domains:
105        iocs["Domains"] = domains
106    if hashes:
107        iocs["Hashes"] = hashes
108    if emails:
109        iocs["Emails"] = emails
110    if windows_paths:
111        iocs["FilePaths"] = windows_paths
112    if file_names:
113        iocs["FileNames"] = file_names
114    if tools:
115        iocs["SuspiciousTools"] = tools
116    return iocs
117
118
def format_ioc_targets(investigation_context: str) -> str:
    """Render the extracted IOC targets as a bullet list.

    Args:
        investigation_context: Free-text investigation context string.

    Returns:
        One ``- <category>: <values>`` line per IOC category, joined with
        newlines.  At most 20 values are shown per category; longer lists
        get a `` ... [truncated]`` marker.  When nothing was extracted a
        fixed explanatory sentence is returned instead.
    """
    extracted = extract_ioc_targets(investigation_context)
    if not extracted:
        return "No explicit IOC patterns were extracted from the investigation context."

    per_category_cap = 20

    def render(category: str, values: list[str]) -> str:
        # Show only the first values and flag that more exist.
        marker = " ... [truncated]" if len(values) > per_category_cap else ""
        joined = ", ".join(values[:per_category_cap])
        return f"- {category}: {joined}{marker}"

    return "\n".join(render(category, values) for category, values in extracted.items())
139
140
def build_priority_directives(investigation_context: str) -> str:
    """Assemble the numbered priority directives for the analysis prompt.

    Directive 2 depends on whether any IOC could be extracted from the
    context: with IOCs it asks for a per-IOC classification, without IOCs
    it asks the model to still follow the user's hypotheses.  The other
    three directives are fixed text.

    Args:
        investigation_context: Free-text investigation context string.

    Returns:
        Four numbered directives joined with newlines.
    """
    # Only the presence of IOCs matters here, not their values.
    if extract_ioc_targets(investigation_context):
        ioc_directive = (
            "2. For each IOC listed below, explicitly classify it as Observed, Not Observed, or Not Assessable "
            "in this artifact."
        )
    else:
        ioc_directive = (
            "2. No explicit IOC was extracted; still prioritize user-stated hypotheses and suspicious themes."
        )

    directives = (
        "1. Treat the user investigation context as highest priority and address it before generic hunting.",
        ioc_directive,
        "3. Always run default DFIR checks: privilege escalation, credential access tooling (including Mimikatz-like activity), persistence, defense evasion, lateral movement, and potential exfiltration.",
        "4. Focus on evidence that improves triage or containment decisions; keep baseline/statistical context secondary.",
    )
    return "\n".join(directives)
168
169
def build_artifact_final_context_reminder(
    artifact_key: str,
    artifact_name: str,
    investigation_context: str,
) -> str:
    """Build the closing reminder section of an artifact prompt.

    Restates the artifact identity, the investigation context, the
    extracted IOC targets and the default DFIR checks in one compact
    Markdown block.  It is meant to sit at the very end of the prompt so
    the key instructions are repeated close to where the model answers.

    Args:
        artifact_key: Unique identifier for the artifact.
        artifact_name: Human-readable artifact name.
        investigation_context: The user's investigation context text.

    Returns:
        A newline-joined section that starts with a Markdown heading.  The
        context text and the IOC summary are each passed through
        ``truncate_for_prompt`` with ``limit=1200``.
    """
    raw_context = stringify_value(investigation_context)
    context_summary = (
        truncate_for_prompt(raw_context, limit=1200)
        if raw_context
        else "No investigation context provided."
    )

    # NOTE(review): format_ioc_targets can return several "- ..." lines; they
    # are interpolated inline after the "IOC targets" bullet below, so later
    # IOC lines render at the same level as the other bullets.
    ioc_summary = truncate_for_prompt(
        format_ioc_targets(investigation_context),
        limit=1200,
    )

    section = (
        "## Final Context Reminder (Do Not Ignore)",
        f"- Artifact key: {artifact_key}",
        f"- Artifact name: {artifact_name}",
        f"- Investigation context (mandatory): {context_summary}",
        f"- IOC targets (mandatory follow-through): {ioc_summary}",
        "- Always run default DFIR checks: privilege escalation, credential-access/Mimikatz-like behavior, malicious program execution, persistence/evasion/lateral movement/exfiltration.",
        "- If evidence is insufficient, mark IOC or DFIR check as Not Assessable.",
    )
    return "\n".join(section)

def extract_ioc_targets(investigation_context: str) -> dict[str, list[str]]: View Source

 59def extract_ioc_targets(investigation_context: str) -> dict[str, list[str]]:
 60    """Extract Indicators of Compromise from investigation context text.
 61
 62    Uses regex patterns to identify URLs, IPv4 addresses, domains,
 63    hashes (MD5/SHA1/SHA256), email addresses, Windows file paths,
 64    executable filenames, and known malicious tool keywords.
 65
 66    Args:
 67        investigation_context: Free-text investigation context string.
 68
 69    Returns:
 70        A dict mapping IOC category names to deduplicated lists of
 71        extracted values.  Returns an empty dict if no IOCs are found.
 72    """
 73    text = stringify_value(investigation_context)
 74    if not text:
 75        return {}
 76
 77    urls = unique_preserve_order(IOC_URL_RE.findall(text))
 78    ips = unique_preserve_order(IOC_IPV4_RE.findall(text))
 79    hashes = unique_preserve_order(IOC_HASH_RE.findall(text))
 80    emails = unique_preserve_order(IOC_EMAIL_RE.findall(text))
 81    windows_paths = unique_preserve_order(WINDOWS_PATH_RE.findall(text))
 82    file_names = unique_preserve_order(IOC_FILENAME_RE.findall(text))
 83    file_names_lower = {value.lower() for value in file_names}
 84    tools = extract_tool_keywords(text)
 85
 86    domain_candidates = unique_preserve_order(IOC_DOMAIN_RE.findall(text))
 87    domains: list[str] = []
 88    url_hosts = {extract_url_host(url) for url in urls}
 89    for domain in domain_candidates:
 90        lowered = domain.lower()
 91        if lowered in url_hosts:
 92            continue
 93        if lowered in file_names_lower:
 94            continue
 95        if any(lowered.endswith(suffix) for suffix in DOMAIN_EXCLUDED_SUFFIXES):
 96            continue
 97        domains.append(domain)
 98    domains = unique_preserve_order(domains)
 99
100    iocs: dict[str, list[str]] = {}
101    if urls:
102        iocs["URLs"] = urls
103    if ips:
104        iocs["IPv4"] = ips
105    if domains:
106        iocs["Domains"] = domains
107    if hashes:
108        iocs["Hashes"] = hashes
109    if emails:
110        iocs["Emails"] = emails
111    if windows_paths:
112        iocs["FilePaths"] = windows_paths
113    if file_names:
114        iocs["FileNames"] = file_names
115    if tools:
116        iocs["SuspiciousTools"] = tools
117    return iocs

Extract Indicators of Compromise from investigation context text.

Uses regex patterns to identify URLs, IPv4 addresses, domains, hashes (MD5/SHA1/SHA256), email addresses, Windows file paths, executable filenames, and known malicious tool keywords.

Arguments:

investigation_context: Free-text investigation context string.

Returns:

A dict mapping IOC category names to deduplicated lists of extracted values. Returns an empty dict if no IOCs are found.

def format_ioc_targets(investigation_context: str) -> str: View Source

def format_ioc_targets(investigation_context: str) -> str:
    """Render the extracted IOC targets as a bullet list.

    Args:
        investigation_context: Free-text investigation context string.

    Returns:
        One ``- <category>: <values>`` line per IOC category, joined with
        newlines.  At most 20 values are shown per category; longer lists
        get a `` ... [truncated]`` marker.  When nothing was extracted a
        fixed explanatory sentence is returned instead.
    """
    extracted = extract_ioc_targets(investigation_context)
    if not extracted:
        return "No explicit IOC patterns were extracted from the investigation context."

    per_category_cap = 20

    def render(category: str, values: list[str]) -> str:
        # Show only the first values and flag that more exist.
        marker = " ... [truncated]" if len(values) > per_category_cap else ""
        joined = ", ".join(values[:per_category_cap])
        return f"- {category}: {joined}{marker}"

    return "\n".join(render(category, values) for category, values in extracted.items())

Format extracted IOC targets as a human-readable bullet list.

Arguments:

investigation_context: Free-text investigation context string.

Returns:

A multi-line string with one bullet per IOC category (up to 20 values each), or a message indicating no IOCs were found.

def build_priority_directives(investigation_context: str) -> str: View Source

def build_priority_directives(investigation_context: str) -> str:
    """Assemble the numbered priority directives for the analysis prompt.

    Directive 2 depends on whether any IOC could be extracted from the
    context: with IOCs it asks for a per-IOC classification, without IOCs
    it asks the model to still follow the user's hypotheses.  The other
    three directives are fixed text.

    Args:
        investigation_context: Free-text investigation context string.

    Returns:
        Four numbered directives joined with newlines.
    """
    # Only the presence of IOCs matters here, not their values.
    if extract_ioc_targets(investigation_context):
        ioc_directive = (
            "2. For each IOC listed below, explicitly classify it as Observed, Not Observed, or Not Assessable "
            "in this artifact."
        )
    else:
        ioc_directive = (
            "2. No explicit IOC was extracted; still prioritize user-stated hypotheses and suspicious themes."
        )

    directives = (
        "1. Treat the user investigation context as highest priority and address it before generic hunting.",
        ioc_directive,
        "3. Always run default DFIR checks: privilege escalation, credential access tooling (including Mimikatz-like activity), persistence, defense evasion, lateral movement, and potential exfiltration.",
        "4. Focus on evidence that improves triage or containment decisions; keep baseline/statistical context secondary.",
    )
    return "\n".join(directives)

Build numbered priority directives for the AI analysis prompt.

Generates a set of directives that instruct the AI to prioritize the user's investigation context, check IOCs, and run standard DFIR checks.

Arguments:

investigation_context: Free-text investigation context string.

Returns:

A multi-line numbered list of priority directives.

def build_artifact_final_context_reminder(artifact_key: str, artifact_name: str, investigation_context: str) -> str: View Source

def build_artifact_final_context_reminder(
    artifact_key: str,
    artifact_name: str,
    investigation_context: str,
) -> str:
    """Build the closing reminder section of an artifact prompt.

    Restates the artifact identity, the investigation context, the
    extracted IOC targets and the default DFIR checks in one compact
    Markdown block.  It is meant to sit at the very end of the prompt so
    the key instructions are repeated close to where the model answers.

    Args:
        artifact_key: Unique identifier for the artifact.
        artifact_name: Human-readable artifact name.
        investigation_context: The user's investigation context text.

    Returns:
        A newline-joined section that starts with a Markdown heading.  The
        context text and the IOC summary are each passed through
        ``truncate_for_prompt`` with ``limit=1200``.
    """
    raw_context = stringify_value(investigation_context)
    context_summary = (
        truncate_for_prompt(raw_context, limit=1200)
        if raw_context
        else "No investigation context provided."
    )

    # NOTE(review): format_ioc_targets can return several "- ..." lines; they
    # are interpolated inline after the "IOC targets" bullet below, so later
    # IOC lines render at the same level as the other bullets.
    ioc_summary = truncate_for_prompt(
        format_ioc_targets(investigation_context),
        limit=1200,
    )

    section = (
        "## Final Context Reminder (Do Not Ignore)",
        f"- Artifact key: {artifact_key}",
        f"- Artifact name: {artifact_name}",
        f"- Investigation context (mandatory): {context_summary}",
        f"- IOC targets (mandatory follow-through): {ioc_summary}",
        "- Always run default DFIR checks: privilege escalation, credential-access/Mimikatz-like behavior, malicious program execution, persistence/evasion/lateral movement/exfiltration.",
        "- If evidence is insufficient, mark IOC or DFIR check as Not Assessable.",
    )
    return "\n".join(section)

Build a short end-of-prompt reminder that survives left-side truncation.

Places critical context (artifact identity, investigation focus, IOC targets, DFIR checks) at the very end of the prompt so that models with left-side attention decay still see the most important instructions.

Arguments:

artifact_key: Unique identifier for the artifact.
artifact_name: Human-readable artifact name.
investigation_context: The user's investigation context text.

Returns:

A multi-line reminder section string starting with a Markdown heading.

def extract_tool_keywords(text: str) -> list[str]: View Source

def extract_tool_keywords(text: str) -> list[str]:
    """Return the known malicious tool keywords that appear in ``text``.

    The text is lowercased once and each entry of
    ``KNOWN_MALICIOUS_TOOL_KEYWORDS`` is tested as a plain substring.

    Args:
        text: Free-text string to scan for tool keywords.

    Returns:
        The matching keywords, deduplicated via ``unique_preserve_order``.
        Results follow the iteration order of
        ``KNOWN_MALICIOUS_TOOL_KEYWORDS``, not the position at which each
        keyword appears in ``text``.
    """
    haystack = text.lower()
    # Keywords are compared as-is against the lowercased text.
    # NOTE(review): presumably the keyword constants are stored lowercase -
    # confirm in constants.
    matched = [kw for kw in KNOWN_MALICIOUS_TOOL_KEYWORDS if kw in haystack]
    return unique_preserve_order(matched)

Extract known malicious tool keyword matches from text.

Arguments:

text: Free-text string to scan for tool keywords.

Returns:

A deduplicated list of matched tool keyword strings, preserving the order of first occurrence.