Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(integration): Implement AWS GuardDuty (#112)
- Loading branch information
1 parent
238dd7e
commit 9c8c154
Showing
7 changed files
with
224 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
"""ETL functions for AWS GuardDuty findings. | ||
API reference: https://docs.aws.amazon.com/guardduty/latest/ug/guardduty_finding-types-active.html | ||
""" | ||
|
||
import logging | ||
from collections.abc import Generator | ||
from datetime import datetime | ||
from functools import partial | ||
from itertools import chain | ||
from typing import TYPE_CHECKING | ||
|
||
import boto3 | ||
import diskcache as dc | ||
import mmh3 | ||
import polars as pl | ||
from tqdm.contrib.concurrent import thread_map | ||
|
||
from tracecat.config import TRACECAT__TRIAGE_DIR | ||
from tracecat.contexts import ctx_session_role | ||
from tracecat.logger import standard_logger | ||
|
||
if TYPE_CHECKING: | ||
from mypy_boto3_guardduty.type_defs import GetFindingsResponseTypeDef | ||
|
||
logger = standard_logger("runner.aws_guardduty")


# Suppress botocore info logs
logging.getLogger("botocore").setLevel(logging.CRITICAL)


# On-disk cache directory for GuardDuty findings (used by the diskcache
# layer in `load_guardduty_findings`); created eagerly at import time.
AWS_GUARDDUTY__TRIAGE_DIR = TRACECAT__TRIAGE_DIR / "aws_guardduty"
AWS_GUARDDUTY__TRIAGE_DIR.mkdir(parents=True, exist_ok=True)


# Maximum number of finding IDs per GetFindings request (GuardDuty API limit).
GET_FINDINGS_MAX_CHUNK_SIZE = 50
|
||
|
||
def _get_all_guardduty_findings(
    chunk_size: int = GET_FINDINGS_MAX_CHUNK_SIZE,
) -> pl.DataFrame:
    """Fetch all GuardDuty findings across every detector in the default region.

    Lists finding IDs per detector, then fetches the full finding documents
    in parallel chunks via `GetFindings`, which accepts at most
    `GET_FINDINGS_MAX_CHUNK_SIZE` IDs per call.

    Args:
        chunk_size: Maximum number of finding IDs per `GetFindings` request.
            Clamped to the API limit `GET_FINDINGS_MAX_CHUNK_SIZE`.

    Returns:
        All retrieved GuardDuty findings as a Polars DataFrame.
    """
    client = boto3.client("guardduty")
    list_findings_paginator = client.get_paginator("list_findings")

    # For all detectors, list finding IDs then fetch the finding documents.
    findings: list[dict] = []
    detectors = client.list_detectors()["DetectorIds"]
    # GetFindings rejects requests exceeding the documented maximum ID count.
    chunk_size = min(chunk_size, GET_FINDINGS_MAX_CHUNK_SIZE)

    def chunker(finding_ids: list[str]) -> Generator[list[str], None, None]:
        """Yield successive `chunk_size`-sized slices of `finding_ids`."""
        for i in range(0, len(finding_ids), chunk_size):
            yield finding_ids[i : i + chunk_size]

    def getter(finding_ids: list[str], *, detector_id: str) -> list[dict]:
        """Fetch the full finding documents for one chunk of finding IDs.

        Creates its own client because this runs inside `thread_map` worker
        threads; sharing one boto3 client across threads is not guaranteed
        safe.
        """
        thread_client = boto3.client("guardduty")
        response = thread_client.get_findings(
            DetectorId=detector_id, FindingIds=finding_ids
        )
        return response.get("Findings", [])

    for detector_id in detectors:
        finding_ids: list[str] = []
        # TODO: Parallelize this?
        for page in list_findings_paginator.paginate(DetectorId=detector_id):
            finding_ids.extend(page.get("FindingIds", []))
        logger.info(f"Found {len(finding_ids)} findings in detector {detector_id}")

        detector_findings: list[list[dict]] = thread_map(
            partial(getter, detector_id=detector_id),
            chunker(finding_ids=finding_ids),
            desc="Getting AWS GuardDuty findings",
        )
        findings.extend(chain.from_iterable(detector_findings))

    logger.info(f"Retrieved {len(findings)} GuardDuty findings")
    df = pl.DataFrame(findings)
    return df
|
||
|
||
# Struct-typed columns that `_stringify_struct_columns` JSON-encodes.
GUARDDUTY_DEFAULT_STRUCT_COLS = ["Service", "Resource"]


def _stringify_struct_columns(df: pl.DataFrame | pl.LazyFrame) -> pl.LazyFrame:
    """Return a lazy frame with the default struct columns JSON-encoded.

    Each column in `GUARDDUTY_DEFAULT_STRUCT_COLS` is replaced by its
    `struct.json_encode()` string representation.
    """
    encoders = [
        pl.col(column).struct.json_encode()
        for column in GUARDDUTY_DEFAULT_STRUCT_COLS
    ]
    lazy_df = df.lazy()
    return lazy_df.with_columns(encoders)
|
||
|
||
def load_guardduty_findings(
    start: datetime,
    end: datetime,
    account_id: str,
    organization_id: str,
) -> pl.LazyFrame:
    """Load AWS GuardDuty findings for the specified time range.

    Caches the full (unfiltered) findings set on disk for 10 minutes to
    avoid repeated expensive API calls; the time-range filter is applied
    on every call after the cache lookup.

    Args:
        start: Inclusive lower bound on the finding `CreatedAt` timestamp.
        end: Inclusive upper bound on the finding `CreatedAt` timestamp.
        account_id: AWS account ID; used only to scope the cache key.
        organization_id: AWS organization ID; used only to scope the cache key.

    Returns:
        GuardDuty findings as a Polars LazyFrame.
    """
    # Include the session role in the cache key to avoid collisions
    # when possibly serving multiple users concurrently
    role = ctx_session_role.get()
    logger.info(f"Loading GuardDuty findings for role {role}")

    # Delimit key components so adjacent values cannot collide
    # (e.g. start="a", end="bc" vs start="ab", end="c").
    key = mmh3.hash(
        f"{role}:{start}:{end}:{account_id}:{organization_id}".encode(), seed=42
    )

    df: pl.DataFrame
    dt_col = "CreatedAt"
    with dc.Cache(directory=AWS_GUARDDUTY__TRIAGE_DIR) as cache:
        if key in cache:
            logger.info("Cache hit for GuardDuty findings")
            # Structs here are already stringified
            df = cache[key]
        else:
            logger.info("Cache miss for GuardDuty findings")
            df = (
                _get_all_guardduty_findings()
                .lazy()
                .pipe(_stringify_struct_columns)
                .collect(streaming=True)
            )
            # Cache for 10 minutes
            cache.set(key=key, value=df, expire=600)
    # Apply time range filter.
    # NOTE(review): assumes `CreatedAt` is a temporal column comparable with
    # `start`/`end` datetimes -- confirm against the GuardDuty response schema.
    df = df.filter(pl.col(dt_col).is_between(start, end))
    return df.lazy()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
"""Native integration to query AWS GuardDuty findings. | ||
Optional secrets: `aws-guardduty` secret with keys `AWS_ACCOUNT_ID` and `AWS_ORGANIZATION_ID`. | ||
Note: this integration DOES NOT support IAM credential based authentication. | ||
Secrets are only used to obscure potentially sensitive data (account ID, organization ID). | ||
""" | ||
|
||
import os | ||
from typing import Any | ||
|
||
import dateutil.parser | ||
|
||
from tracecat.etl.aws_guardduty import load_guardduty_findings | ||
from tracecat.etl.query_builder import pl_sql_query | ||
from tracecat.integrations._registry import registry | ||
|
||
|
||
@registry.register(
    description="Query AWS GuardDuty findings", secrets=["aws-guardduty"]
)
def query_guardduty_findings(
    start: str,
    end: str,
    query: str,
    account_id: str | None = None,
    organization_id: str | None = None,
) -> list[dict[str, Any]]:
    """Run a SQL query over GuardDuty findings within a time range.

    Args:
        start: Start of the time range; any `dateutil`-parseable string.
        end: End of the time range; any `dateutil`-parseable string.
        query: SQL query executed against the findings frame.
        account_id: AWS account ID. Falls back to the `AWS_ACCOUNT_ID`
            environment variable; raises `KeyError` if neither is set.
        organization_id: AWS organization ID. Falls back to the
            `AWS_ORGANIZATION_ID` environment variable; raises `KeyError`
            if neither is set.

    Returns:
        The query result rows as a list of dicts.
    """
    account_id = account_id or os.environ["AWS_ACCOUNT_ID"]
    organization_id = organization_id or os.environ["AWS_ORGANIZATION_ID"]
    start_dt = dateutil.parser.parse(start)
    end_dt = dateutil.parser.parse(end)
    # `load_guardduty_findings` caches findings on disk, keyed by the
    # session role and these call arguments.
    findings_lf = load_guardduty_findings(
        start=start_dt,
        end=end_dt,
        account_id=account_id,
        organization_id=organization_id,
    )
    queried_findings = pl_sql_query(lf=findings_lf, query=query, eager=True).to_dicts()
    return queried_findings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters