Skip to content

Commit

Permalink
feat(integration): Implement AWS GuardDuty (#112)
Browse files Browse the repository at this point in the history
  • Loading branch information
daryllimyt committed May 10, 2024
1 parent 238dd7e commit 9c8c154
Show file tree
Hide file tree
Showing 7 changed files with 224 additions and 1 deletion.
24 changes: 24 additions & 0 deletions frontend/src/components/icons.tsx
Expand Up @@ -187,6 +187,30 @@ export const Integrations: Record<
</g>
</svg>
),
// AWS GuardDuty icon: white glyph on a red gradient tile.
// Extra props are spread onto the <svg>; `className` is merged with the
// default "rounded-full" class via `cn`.
aws_guardduty: ({ className, ...rest }: IconProps) => (
  <svg
    className={cn("rounded-full", className)}
    height="100%"
    width="100%"
    viewBox="0 0 40 40"
    xmlns="http://www.w3.org/2000/svg"
    {...rest}
  >
    <defs>
      {/* Icon-specific id so it cannot clash with other inline SVG gradients in the same document. */}
      <linearGradient
        x1="0%"
        y1="100%"
        x2="100%"
        y2="0%"
        id="aws-guardduty-gradient"
      >
        {/* React expects camelCased SVG attributes (stopColor, fillRule). */}
        <stop stopColor="#BD0816" offset="0%" />
        <stop stopColor="#FF5252" offset="100%" />
      </linearGradient>
    </defs>
    <g fill="none" fillRule="evenodd">
      <path d="M0 0h40v40H0z" fill="url(#aws-guardduty-gradient)" />
      <path
        d="M25 21.814c-.001.19-.074 4.475-4 4.818V14.594l4 1.734v5.486zm-9-.015v-5.471l4-1.734v12.038c-3.927-.344-3.999-4.643-4-4.833zm9.699-6.258-5-2.167a.5.5 0 0 0-.398 0l-5 2.167A.499.499 0 0 0 15 16v5.799c0 .059.062 5.867 5.5 5.867 5.437 0 5.5-5.792 5.5-5.851V16c0-.2-.118-.38-.301-.459zM34 22.752l-1.794-.102a.479.479 0 0 0-.516.389 11.379 11.379 0 0 1-1.482 3.577.498.498 0 0 0 .09.64l1.342 1.197-3.188 3.185-1.191-1.336a.501.501 0 0 0-.64-.09 11.361 11.361 0 0 1-3.582 1.483.502.502 0 0 0-.389.517L22.753 34h-4.506l.103-1.789a.503.503 0 0 0-.389-.517 11.348 11.348 0 0 1-3.578-1.484.505.505 0 0 0-.641.089l-1.194 1.339-3.188-3.186 1.339-1.194a.497.497 0 0 0 .09-.639A11.351 11.351 0 0 1 9.31 23.04c-.054-.238-.245-.394-.517-.39L7 22.752v-4.506l1.797.103a.482.482 0 0 0 .515-.389 11.411 11.411 0 0 1 1.485-3.572.498.498 0 0 0-.09-.64L9.36 12.547l3.188-3.187 1.199 1.345c.162.181.434.22.64.09a11.405 11.405 0 0 1 3.574-1.479.5.5 0 0 0 .389-.516L18.247 7h4.506l-.103 1.802a.502.502 0 0 0 .389.516c1.268.287 2.47.784 3.571 1.48a.5.5 0 0 0 .64-.09l1.202-1.348 3.188 3.187-1.344 1.198a.499.499 0 0 0-.09.64 11.42 11.42 0 0 1 1.482 3.575.489.489 0 0 0 .516.389L34 18.246v4.506zm.844-5.398a.516.516 0 0 0-.372-.136l-1.909.109a12.42 12.42 0 0 0-1.291-3.113L32.7 12.94a.497.497 0 0 0 .021-.726l-3.936-3.935a.497.497 0 0 0-.726.021l-1.278 1.431a12.348 12.348 0 0 0-3.109-1.288l.109-1.915A.501.501 0 0 0 23.282 6h-5.564a.5.5 0 0 0-.499.528l.109 1.913a12.376 12.376 0 0 0-3.111 1.288L12.941 8.3a.497.497 0 0 0-.358-.167.59.59 0 0 0-.368.146l-3.936 3.935a.5.5 0 0 0 .021.726l1.43 1.276a12.394 12.394 0 0 0-1.292 3.111l-1.91-.109a.544.544 0 0 0-.372.136.5.5 0 0 0-.156.363v5.565a.502.502 0 0 0 .528.499l1.906-.109a12.33 12.33 0 0 0 1.289 3.117L8.3 28.059a.497.497 0 0 0-.021.726l3.936 3.935a.497.497 0 0 0 .726-.021l1.27-1.423a12.35 12.35 0 0 0 3.116 1.293l-.108 1.902a.501.501 0 0 0 .499.529h5.564a.5.5 0 0 0 .499-.529l-.108-1.901a12.374 12.374 0 0 0 3.119-1.292l1.267 1.421a.497.497 0 0 0 .358.167.473.473 0 0 0 .368-.146l3.936-3.935a.5.5 0 0 0-.021-.726l-1.426-1.272a12.275 12.275 0 0 0 1.29-3.115l1.908.109a.502.502 0 0 0 .528-.499v-5.565a.5.5 0 0 0-.156-.363z"
        fill="#FFF"
      />
    </g>
  </svg>
),
datadog: (props: IconProps) => (
<svg
role="img"
Expand Down
2 changes: 2 additions & 0 deletions frontend/src/types/schemas.ts
Expand Up @@ -30,6 +30,7 @@ export type ActionType = (typeof actionTypes)[number]
/** All platforms that are supported by the system. */
const integrationPlatforms = [
"aws_cloudtrail",
"aws_guardduty",
"datadog",
"emailrep",
"sublime",
Expand All @@ -49,6 +50,7 @@ export type IntegrationPlatform = (typeof integrationPlatforms)[number]
*/
const integrationTypes = [
"integrations.aws_cloudtrail.query_cloudtrail_logs",
"integrations.aws_guardduty.query_guardduty_findings",
"integrations.datadog.list_detection_rules",
"integrations.datadog.list_security_signals",
"integrations.datadog.update_security_signal_state",
Expand Down
11 changes: 10 additions & 1 deletion pyproject.toml
Expand Up @@ -26,9 +26,11 @@ dependencies = [
"colorlog",
"croniter",
"cryptography",
"diskcache==5.6.3",
"fastapi",
"httpx",
"lancedb==0.6.3",
"mmh3==4.1.0",
"loguru==0.7.2",
"openai",
"orjson",
Expand All @@ -54,7 +56,14 @@ Repository = "https://github.com/TracecatHQ/tracecat"

[project.optional-dependencies]
runner = ["aiosmtplib", "jsonpath_ng", "python-multipart"]
dev = ["respx", "pytest", "python-dotenv", "pytest-asyncio", "minio"]
dev = [
"respx",
"pytest",
"python-dotenv",
"pytest-asyncio",
"minio",
"boto3-stubs[cloudtrail,guardduty,s3]",
]

[tool.hatch.version]
path = "tracecat/__init__.py"
Expand Down
142 changes: 142 additions & 0 deletions tracecat/etl/aws_guardduty.py
@@ -0,0 +1,142 @@
"""ETL functions for AWS GuardDuty findings.
API reference: https://docs.aws.amazon.com/guardduty/latest/ug/guardduty_finding-types-active.html
"""

import logging
from collections.abc import Generator
from datetime import datetime
from functools import partial
from itertools import chain
from typing import TYPE_CHECKING

import boto3
import diskcache as dc
import mmh3
import polars as pl
from tqdm.contrib.concurrent import thread_map

from tracecat.config import TRACECAT__TRIAGE_DIR
from tracecat.contexts import ctx_session_role
from tracecat.logger import standard_logger

if TYPE_CHECKING:
from mypy_boto3_guardduty.type_defs import GetFindingsResponseTypeDef

# Module-level logger for the GuardDuty ETL functions.
logger = standard_logger("runner.aws_guardduty")

# Suppress botocore logs below CRITICAL
logging.getLogger("botocore").setLevel(logging.CRITICAL)

# Directory backing the on-disk findings cache; created at import time if missing.
AWS_GUARDDUTY__TRIAGE_DIR = TRACECAT__TRIAGE_DIR / "aws_guardduty"
AWS_GUARDDUTY__TRIAGE_DIR.mkdir(parents=True, exist_ok=True)

# Upper bound on the number of finding IDs sent in a single `get_findings` request.
GET_FINDINGS_MAX_CHUNK_SIZE = 50


def _get_all_guardduty_findings(
    chunk_size: int = GET_FINDINGS_MAX_CHUNK_SIZE,
) -> pl.DataFrame:
    """Fetch every GuardDuty finding visible to the default boto3 credentials.

    For each detector returned by ``list_detectors``, all finding IDs are
    paged through with ``list_findings`` and the full finding records are then
    fetched concurrently with ``get_findings``, ``chunk_size`` IDs at a time.

    Args:
        chunk_size: Number of finding IDs per ``get_findings`` request.
            Values above ``GET_FINDINGS_MAX_CHUNK_SIZE`` are clamped to it.

    Returns:
        A Polars DataFrame built from the retrieved finding records.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A zero step would raise deep inside the worker pool and a negative one
        # would silently drop every finding, so reject both up front.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    chunk_size = min(chunk_size, GET_FINDINGS_MAX_CHUNK_SIZE)

    client = boto3.client("guardduty")
    list_findings_paginator = client.get_paginator("list_findings")
    detectors = client.list_detectors()["DetectorIds"]

    def getter(finding_ids: list[str], *, detector_id: str) -> list[dict]:
        # Reuse the enclosing client: low-level boto3 clients can be shared across
        # threads, whereas building clients from the shared default session inside
        # worker threads is not safe.
        response = client.get_findings(DetectorId=detector_id, FindingIds=finding_ids)
        return response.get("Findings", [])

    findings: list[dict] = []
    for detector_id in detectors:
        finding_ids: list[str] = []
        # TODO: Parallelize this?
        for page in list_findings_paginator.paginate(DetectorId=detector_id):
            finding_ids.extend(page.get("FindingIds", []))
        logger.info(f"Found {len(finding_ids)} findings in detector {detector_id}")

        # Materialise the chunks so the progress bar knows its total.
        chunks = [
            finding_ids[i : i + chunk_size]
            for i in range(0, len(finding_ids), chunk_size)
        ]
        detector_findings: list[list[dict]] = thread_map(
            partial(getter, detector_id=detector_id),
            chunks,
            desc="📂 Getting AWS GuardDuty findings",
        )
        findings.extend(chain.from_iterable(detector_findings))

    logger.info(f"Retrieved {len(findings)} GuardDuty findings")
    return pl.DataFrame(findings)


# Columns that are JSON-encoded by `_stringify_struct_columns`.
GUARDDUTY_DEFAULT_STRUCT_COLS = ["Service", "Resource"]


def _stringify_struct_columns(df: pl.DataFrame | pl.LazyFrame) -> pl.LazyFrame:
    """Replace each column in ``GUARDDUTY_DEFAULT_STRUCT_COLS`` with its JSON encoding.

    Args:
        df: Eager or lazy frame containing the struct columns.

    Returns:
        A LazyFrame in which those columns hold JSON strings.
    """
    json_encoded = [
        pl.col(name).struct.json_encode() for name in GUARDDUTY_DEFAULT_STRUCT_COLS
    ]
    lazy_frame = df.lazy()
    return lazy_frame.with_columns(json_encoded)


def load_guardduty_findings(
    start: datetime,
    end: datetime,
    account_id: str,
    organization_id: str,
) -> pl.LazyFrame:
    """Load AWS GuardDuty findings created between ``start`` and ``end``.

    All findings are fetched with ``_get_all_guardduty_findings`` and cached on
    disk for 10 minutes to avoid repeated (expensive) API calls. The time-range
    filter is applied after the (possibly cached) findings are loaded.

    Args:
        start: Lower bound applied to the ``CreatedAt`` column.
        end: Upper bound applied to the ``CreatedAt`` column.
        account_id: AWS account ID; only used as part of the cache key.
        organization_id: AWS organization ID; only used as part of the cache key.

    Returns:
        The filtered findings as a Polars LazyFrame. If no findings exist, the
        frame is empty and has no columns.
    """
    # Include the session role in the cache key to avoid collisions
    # when possibly serving multiple users concurrently
    role = ctx_session_role.get()
    logger.info(f"Loading GuardDuty findings for role {role}")

    # Delimit the fields so that different combinations cannot concatenate
    # to the same string (e.g. "12" + "3" versus "1" + "23").
    key_material = "|".join(
        str(part) for part in (role, start, end, account_id, organization_id)
    )
    key = mmh3.hash(key_material.encode(), seed=42)

    df: pl.DataFrame
    dt_col = "CreatedAt"
    with dc.Cache(directory=AWS_GUARDDUTY__TRIAGE_DIR) as cache:
        # Single lookup: a separate `in` check followed by indexing can raise
        # KeyError if the entry expires between the two calls.
        cached = cache.get(key)
        if cached is not None:
            logger.info("Cache hit for GuardDuty findings")
            # Structs here are already stringified
            df = cached
        else:
            logger.info("Cache miss for GuardDuty findings")
            raw = _get_all_guardduty_findings()
            if not raw.columns:
                # No findings at all: the frame has no columns, so there is
                # nothing to stringify.
                df = raw
            else:
                df = (
                    raw.lazy()
                    .pipe(_stringify_struct_columns)
                    .collect(streaming=True)
                )
            # Cache for 10 minutes
            cache.set(key=key, value=df, expire=600)

    if dt_col not in df.columns:
        # Empty result without a timestamp column: nothing to filter.
        return df.lazy()

    # Apply time range filter
    # NOTE(review): confirm the dtype of `CreatedAt` is comparable with the
    # datetime bounds passed in here.
    df = df.filter(pl.col(dt_col).is_between(start, end))
    return df.lazy()
2 changes: 2 additions & 0 deletions tracecat/integrations/__init__.py
Expand Up @@ -3,6 +3,7 @@
# Import modules to register integrations
from tracecat.integrations import (
aws_cloudtrail,
aws_guardduty,
datadog,
emailrep,
project_discovery,
Expand All @@ -18,6 +19,7 @@
"registry",
# Integrations
"aws_cloudtrail",
"aws_guardduty",
"datadog",
"emailrep",
"project_discovery",
Expand Down
43 changes: 43 additions & 0 deletions tracecat/integrations/aws_guardduty.py
@@ -0,0 +1,43 @@
"""Native integration to query AWS GuardDuty findings.
Optional secrets: `aws-guardduty` secret with keys `AWS_ACCOUNT_ID` and `AWS_ORGANIZATION_ID`.
Note: this integration DOES NOT support IAM credential based authentication.
Secrets are only used to obscure potentially sensitive data (account ID, organization ID).
"""

import os
from typing import Any

import dateutil.parser

from tracecat.etl.aws_guardduty import load_guardduty_findings
from tracecat.etl.query_builder import pl_sql_query
from tracecat.integrations._registry import registry


@registry.register(description="Query AWS GuardDuty findings", secrets=["aws-guardduty"])
def query_guardduty_findings(
    start: str,
    end: str,
    query: str,
    account_id: str | None = None,
    organization_id: str | None = None,
) -> list[dict[str, Any]]:
    """Run a SQL query over GuardDuty findings within a time window.

    Args:
        start: Start of the time window, parsed with ``dateutil.parser.parse``.
        end: End of the time window, parsed with ``dateutil.parser.parse``.
        query: Query passed to ``pl_sql_query`` together with the loaded findings.
        account_id: AWS account ID. Falls back to the ``AWS_ACCOUNT_ID``
            environment variable when falsy.
        organization_id: AWS organization ID. Falls back to the
            ``AWS_ORGANIZATION_ID`` environment variable when falsy.

    Returns:
        The query result as a list of row dictionaries.

    Raises:
        KeyError: If an ID is not given and its environment variable is unset.
    """
    if not account_id:
        account_id = os.environ["AWS_ACCOUNT_ID"]
    if not organization_id:
        organization_id = os.environ["AWS_ORGANIZATION_ID"]

    window_start = dateutil.parser.parse(start)
    window_end = dateutil.parser.parse(end)

    # Loading is cached inside `load_guardduty_findings`, which builds its cache
    # key from the session role and the arguments passed here.
    lazy_findings = load_guardduty_findings(
        start=window_start,
        end=window_end,
        account_id=account_id,
        organization_id=organization_id,
    )
    result_frame = pl_sql_query(lf=lazy_findings, query=query, eager=True)
    return result_frame.to_dicts()
1 change: 1 addition & 0 deletions tracecat/types/actions.py
Expand Up @@ -18,6 +18,7 @@
"open_case",
# Integrations
"integrations.aws_cloudtrail.query_cloudtrail_logs",
"integrations.aws_guardduty.query_guardduty_findings",
"integrations.datadog.list_detection_rules",
"integrations.datadog.list_security_signals",
"integrations.datadog.update_security_signal_state",
Expand Down

0 comments on commit 9c8c154

Please sign in to comment.