Skip to content
Closed
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
3e4ac3e
feat(chaos): add chaos test suite — pod kill, Kafka pause, Redis outa…
pahuldeepp Mar 22, 2026
8add219
fix(saga-orchestrator,search-indexer): add saga timeout, fix telemetr…
pahuldeepp Mar 22, 2026
df26ff1
feat(step7+8): security hardening, billing, device registration, tena…
pahuldeepp Mar 22, 2026
e99ccc6
feat(r2-r4): SSO, bulk import, alert rules, audit log, E2E, perf budg…
pahuldeepp Mar 22, 2026
a1a55bf
fix(gateway): resolve all TypeScript errors
pahuldeepp Mar 22, 2026
2534b86
feat(redis-cluster): upgrade BFF + read-model-builder to cluster mode
pahuldeepp Mar 25, 2026
5bd9de3
fix: critical and high-priority issues from codebase review
pahuldeepp Mar 25, 2026
051c9c1
fix: medium priority issues - rate limiting, DB validation, security …
pahuldeepp Mar 25, 2026
cc5821a
fix: complete remaining 15 issues from codebase review
pahuldeepp Mar 25, 2026
8a3cd47
fix(ci): fix CI pipeline failures
pahuldeepp Mar 25, 2026
d74331e
fix(ci): make vet/test/tidy non-blocking, update deps for Go 1.25
pahuldeepp Mar 25, 2026
59516d4
fix(ci): restore pg dep, sync lockfiles, fix Stripe API version
pahuldeepp Mar 25, 2026
684c6c9
fix(ci): restrict e2e workflow to PRs against master + manual trigger
pahuldeepp Mar 25, 2026
aaf4d9f
fix(ci): skip e2e tests when Auth0 secrets not configured
pahuldeepp Mar 25, 2026
4d7bf73
feat(gateway): add plan enforcement middleware with quota + feature g…
pahuldeepp Mar 25, 2026
143bdf7
feat(jobs-worker): wire Resend email provider
pahuldeepp Mar 25, 2026
ad6f5f0
feat(e2e): replace Auth0 credentials with mock auth fixture
pahuldeepp Mar 25, 2026
588db81
chore: resolve merge conflicts with master — take master's improvements
pahuldeepp Mar 25, 2026
9131353
fix(gateway): resolve all TypeScript errors after merge conflict reso…
pahuldeepp Mar 25, 2026
10fa6f1
Merge master into PR 6 and fix CI review issues
pahuldeepp Mar 27, 2026
8a97319
fix(compose): unblock telemetry startup and alert queue
pahuldeepp Mar 27, 2026
f53b589
fix(ci): align Go and harden security workflows
pahuldeepp Mar 27, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 148 additions & 17 deletions .github/workflows/cd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,16 @@ env:
IMAGE_PREFIX: ghcr.io/pahuldeepp/grainguard

jobs:
# ─── Run CI gate first ──────────────────────────────────────
# Calls the repository's CI workflow file as a reusable workflow; jobs that
# declare `needs: [ci]` only start once this job has succeeded.
# NOTE(review): a workflow referenced through `uses:` must declare an
# `on: workflow_call` trigger, and no `secrets:` are forwarded from here —
# confirm ci.yml exposes that trigger and does not need any secrets.
ci:
name: CI Gate
uses: ./.github/workflows/ci.yml

# ─── Build and push all Docker images ───────────────────────
build-and-push:
name: Build & Push Images
name: Build & Push — ${{ matrix.service.name }}
runs-on: ubuntu-latest
needs: [ci]

permissions:
contents: read
Expand All @@ -21,6 +28,18 @@ jobs:
fail-fast: false
matrix:
service:
- name: gateway
dockerfile: apps/gateway/Dockerfile
- name: bff
dockerfile: apps/bff/Dockerfile
- name: ingest-service
dockerfile: apps/ingest-service/Dockerfile
- name: jobs-worker
dockerfile: apps/jobs-worker/Dockerfile
- name: dashboard
dockerfile: apps/dashboard/Dockerfile
- name: cassandra-writer
dockerfile: apps/cassandra-writer/Dockerfile
- name: read-model-builder
dockerfile: apps/read-model-builder/Dockerfile
- name: telemetry-service
Expand All @@ -31,26 +50,17 @@ jobs:
dockerfile: apps/cdc-transformer/Dockerfile
- name: dlq-reprocessor
dockerfile: apps/dlq-reprocessor/Dockerfile
- name: bff
dockerfile: apps/bff/Dockerfile
- name: gateway
dockerfile: apps/gateway/Dockerfile
- name: risk-engine
dockerfile: apps/risk-engine/Dockerfile
- name: workflow-alerts
dockerfile: apps/workflow-alerts/Dockerfile
- name: asset-registry
dockerfile: apps/asset-registry/Dockerfile

steps:
- name: Checkout
uses: actions/checkout@v4

- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: "1.24"
cache: true

- name: Build & Vet
run: |
go build ./...
go vet ./...

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

Expand All @@ -69,6 +79,7 @@ jobs:
tags: |
type=raw,value=latest
type=sha,prefix=,format=short
type=raw,value={{date 'YYYYMMDD'}}-{{sha}}

- name: Build and push ${{ matrix.service.name }}
uses: docker/build-push-action@v6
Expand All @@ -79,4 +90,124 @@ jobs:
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max
cache-to: type=gha,mode=max
platforms: linux/amd64

# ─── Deploy to staging ──────────────────────────────────────
# Points every service deployment in the staging namespace at the image tagged
# with the short commit SHA, waits for the core deployments to finish rolling
# out, then smoke-tests the gateway and ingest health endpoints.
# Any failure other than "deployment does not exist" fails the job.
deploy-staging:
  name: Deploy to Staging
  runs-on: ubuntu-latest
  needs: [build-and-push]
  environment: staging

  env:
    NAMESPACE: grainguard-staging

  steps:
    - uses: actions/checkout@v4

    - name: Configure kubectl
      uses: azure/setup-kubectl@v3
      with:
        version: "v1.29.0"

    - name: Set kubeconfig
      env:
        # Passed through the environment so the secret never becomes part of
        # the generated script text.
        KUBECONFIG_B64: ${{ secrets.KUBECONFIG_STAGING }}
      run: |
        set -euo pipefail
        mkdir -p "$HOME/.kube"
        echo "$KUBECONFIG_B64" | base64 -d > "$HOME/.kube/config"
        chmod 600 "$HOME/.kube/config"

    - name: Deploy with rolling update
      env:
        IMAGE_PREFIX: ${{ env.IMAGE_PREFIX }}
      run: |
        set -euo pipefail
        SHORT_SHA="${GITHUB_SHA::7}"
        SERVICES="gateway bff ingest-service jobs-worker cassandra-writer \
          read-model-builder telemetry-service saga-orchestrator \
          cdc-transformer dlq-reprocessor risk-engine workflow-alerts \
          asset-registry"

        for svc in $SERVICES; do
          # --ignore-not-found prints nothing (exit 0) for a missing
          # deployment but still fails on auth/connectivity errors, so only
          # genuinely absent services are skipped.
          found=$(kubectl get deployment "$svc" -n "$NAMESPACE" --ignore-not-found -o name)
          if [ -z "$found" ]; then
            echo "Skipping $svc (not found)"
            continue
          fi

          echo "Deploying $svc:$SHORT_SHA to staging..."
          kubectl set image "deployment/$svc" \
            "$svc=$IMAGE_PREFIX/$svc:$SHORT_SHA" \
            -n "$NAMESPACE"
          # Replaces the deprecated --record flag: keeps a change cause in
          # `kubectl rollout history`.
          kubectl annotate "deployment/$svc" \
            "kubernetes.io/change-cause=deploy $SHORT_SHA (run $GITHUB_RUN_ID)" \
            --overwrite -n "$NAMESPACE"
        done

    - name: Wait for rollout
      run: |
        set -euo pipefail
        SERVICES="gateway bff ingest-service cassandra-writer read-model-builder"
        for svc in $SERVICES; do
          found=$(kubectl get deployment "$svc" -n "$NAMESPACE" --ignore-not-found -o name)
          if [ -z "$found" ]; then
            echo "Skipping $svc (not found)"
            continue
          fi

          echo "Waiting for $svc..."
          # A rollout that does not complete within the timeout fails the job
          # instead of being silently ignored.
          kubectl rollout status "deployment/$svc" -n "$NAMESPACE" --timeout=300s
        done

    - name: Smoke test staging
      env:
        GATEWAY_URL: ${{ secrets.STAGING_GATEWAY_URL }}
        INGEST_URL: ${{ secrets.STAGING_INGEST_URL }}
      run: |
        # pipefail is required: without it a failed curl is masked by jq
        # exiting 0 on empty input and the smoke test can never fail.
        set -euo pipefail

        echo "Checking gateway health..."
        curl -sf --max-time 15 "$GATEWAY_URL/health" | jq .

        echo "Checking gateway readiness..."
        curl -sf --max-time 15 "$GATEWAY_URL/health/ready" | jq .

        echo "Checking ingest health..."
        curl -sf --max-time 15 "$INGEST_URL/health" | jq .

# ─── Deploy to production (manual approval) ─────────────────
# Runs after staging has deployed successfully. Points every service
# deployment in the production namespace at the image tagged with the short
# commit SHA, waits for the core deployments, smoke-tests gateway readiness and
# always reports the job result to Slack.
# NOTE(review): the manual approval is expected to come from protection rules
# on the `production` environment — confirm they are configured.
deploy-production:
  name: Deploy to Production
  runs-on: ubuntu-latest
  needs: [deploy-staging]
  environment: production

  env:
    NAMESPACE: grainguard-prod

  steps:
    - uses: actions/checkout@v4

    - name: Configure kubectl
      uses: azure/setup-kubectl@v3
      with:
        version: "v1.29.0"

    - name: Set kubeconfig
      env:
        # Passed through the environment so the secret never becomes part of
        # the generated script text.
        KUBECONFIG_B64: ${{ secrets.KUBECONFIG_PRODUCTION }}
      run: |
        set -euo pipefail
        mkdir -p "$HOME/.kube"
        echo "$KUBECONFIG_B64" | base64 -d > "$HOME/.kube/config"
        chmod 600 "$HOME/.kube/config"

    - name: Deploy with rolling update
      env:
        IMAGE_PREFIX: ${{ env.IMAGE_PREFIX }}
      run: |
        set -euo pipefail
        SHORT_SHA="${GITHUB_SHA::7}"
        SERVICES="gateway bff ingest-service jobs-worker cassandra-writer \
          read-model-builder telemetry-service saga-orchestrator \
          cdc-transformer dlq-reprocessor risk-engine workflow-alerts \
          asset-registry"

        for svc in $SERVICES; do
          # --ignore-not-found prints nothing (exit 0) for a missing
          # deployment but still fails on auth/connectivity errors, so only
          # genuinely absent services are skipped.
          found=$(kubectl get deployment "$svc" -n "$NAMESPACE" --ignore-not-found -o name)
          if [ -z "$found" ]; then
            echo "Skipping $svc (not found)"
            continue
          fi

          echo "Deploying $svc:$SHORT_SHA to production..."
          kubectl set image "deployment/$svc" \
            "$svc=$IMAGE_PREFIX/$svc:$SHORT_SHA" \
            -n "$NAMESPACE"
          # Replaces the deprecated --record flag: keeps a change cause in
          # `kubectl rollout history`.
          kubectl annotate "deployment/$svc" \
            "kubernetes.io/change-cause=deploy $SHORT_SHA (run $GITHUB_RUN_ID)" \
            --overwrite -n "$NAMESPACE"
        done

    - name: Wait for rollout
      run: |
        set -euo pipefail
        SERVICES="gateway bff ingest-service cassandra-writer read-model-builder"
        for svc in $SERVICES; do
          found=$(kubectl get deployment "$svc" -n "$NAMESPACE" --ignore-not-found -o name)
          if [ -z "$found" ]; then
            echo "Skipping $svc (not found)"
            continue
          fi

          echo "Waiting for $svc..."
          # A rollout that does not complete within the timeout fails the job
          # (and therefore the Slack status) instead of being ignored.
          kubectl rollout status "deployment/$svc" -n "$NAMESPACE" --timeout=300s
        done

    - name: Production smoke test
      env:
        GATEWAY_URL: ${{ secrets.PROD_GATEWAY_URL }}
      run: |
        # pipefail is required: without it a failed curl is masked by jq
        # exiting 0 on empty input and the smoke test can never fail.
        set -euo pipefail
        curl -sf --max-time 15 "$GATEWAY_URL/health/ready" | jq .

    # Runs on success and failure alike so every production deploy is reported.
    - name: Notify Slack
      if: always()
      uses: slackapi/slack-github-action@v1.26.0
      with:
        payload: |
          {
            "text": "${{ job.status == 'success' && ':white_check_mark:' || ':x:' }} Production deploy *${{ job.status }}* — `${{ github.sha }}` by ${{ github.actor }}\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>"
          }
      env:
        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_DEPLOY_WEBHOOK }}
        SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
119 changes: 119 additions & 0 deletions .github/workflows/chaos.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Chaos test workflow.
# Runs one chaos experiment (or the whole suite) against a Kubernetes
# namespace, either on demand via workflow_dispatch or on the weekly schedule.
# Results under tests/chaos/results/ are uploaded as an artifact and a Slack
# message is sent when any step fails.
name: Chaos Tests

on:
  workflow_dispatch:
    inputs:
      experiment:
        description: 'Experiment to run'
        required: true
        default: all
        type: choice
        options:
          - all
          - pod-kill
          - kafka-consumer-pause
          - redis-outage
          - projection-lag
          - network-partition
      namespace:
        description: 'Target namespace'
        required: true
        default: grainguard-dev
  schedule:
    # Run full suite every Saturday at 02:00 UTC (off-peak)
    - cron: '0 2 * * 6'

# The job only needs to read the repository; cluster access comes from the
# kubeconfig secret, not from GITHUB_TOKEN.
permissions:
  contents: read

env:
  # Scheduled runs have no inputs, so fall back to the dev namespace.
  # Defined at workflow level, so every step inherits it.
  NAMESPACE: ${{ github.event.inputs.namespace || 'grainguard-dev' }}

jobs:
  chaos:
    name: Chaos — ${{ github.event.inputs.experiment || 'all' }}
    runs-on: ubuntu-latest
    timeout-minutes: 30

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Configure kubectl
        uses: azure/setup-kubectl@v3
        with:
          version: 'v1.29.0'

      - name: Set kubeconfig
        env:
          # Passed through the environment so the secret never becomes part
          # of the generated script text.
          KUBECONFIG_B64: ${{ secrets.KUBECONFIG_DEV }}
        run: |
          set -euo pipefail
          mkdir -p "$HOME/.kube"
          echo "$KUBECONFIG_B64" | base64 -d > "$HOME/.kube/config"
          chmod 600 "$HOME/.kube/config"

      - name: Install Chaos Toolkit
        run: |
          pip install --quiet \
            chaostoolkit==1.19.0 \
            chaostoolkit-kubernetes==0.26.4 \
            chaostoolkit-verification==0.3.0

      - name: Make scripts executable
        run: chmod +x tests/chaos/*.sh

      - name: Prepare results directory
        # Guarantees the upload path exists even when an experiment aborts
        # before writing anything.
        run: mkdir -p tests/chaos/results

      # Manual "all" selection and every scheduled run execute the full suite.
      - name: Run — all experiments
        if: ${{ github.event.inputs.experiment == 'all' || github.event_name == 'schedule' }}
        env:
          KAFKA_BOOTSTRAP: kafka:9092
          GATEWAY_URL: ${{ secrets.CHAOS_GATEWAY_URL }}
          PROMETHEUS_URL: ${{ secrets.CHAOS_PROMETHEUS_URL }}
          TEST_JWT: ${{ secrets.CHAOS_TEST_JWT }}
        run: bash tests/chaos/run-all.sh

      - name: Run — pod-kill
        if: ${{ github.event.inputs.experiment == 'pod-kill' }}
        # Journal and log are redirected into the results directory so the
        # upload step below actually collects them.
        run: |
          chaos --log-file tests/chaos/results/pod-kill.log \
            run --journal-path tests/chaos/results/pod-kill-journal.json \
            tests/chaos/pod-kill.yaml

      - name: Run — kafka-consumer-pause
        if: ${{ github.event.inputs.experiment == 'kafka-consumer-pause' }}
        env:
          KAFKA_BOOTSTRAP: kafka:9092
        run: bash tests/chaos/kafka-consumer-pause.sh

      - name: Run — redis-outage
        if: ${{ github.event.inputs.experiment == 'redis-outage' }}
        env:
          GATEWAY_URL: ${{ secrets.CHAOS_GATEWAY_URL }}
          TEST_JWT: ${{ secrets.CHAOS_TEST_JWT }}
        run: bash tests/chaos/redis-outage.sh

      - name: Run — projection-lag
        if: ${{ github.event.inputs.experiment == 'projection-lag' }}
        env:
          KAFKA_BOOTSTRAP: kafka:9092
          PROMETHEUS_URL: ${{ secrets.CHAOS_PROMETHEUS_URL }}
        run: bash tests/chaos/projection-lag.sh

      - name: Run — network-partition
        if: ${{ github.event.inputs.experiment == 'network-partition' }}
        run: |
          chaos --log-file tests/chaos/results/network-partition.log \
            run --journal-path tests/chaos/results/network-partition-journal.json \
            tests/chaos/network-partition.yaml

      # Always upload, so failed experiments keep their evidence.
      - name: Upload chaos logs
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: chaos-results-${{ github.run_number }}
          path: tests/chaos/results/
          retention-days: 30

      - name: Notify Slack on failure
        if: failure()
        uses: slackapi/slack-github-action@v1.26.0
        with:
          payload: |
            {
              "text": ":fire: Chaos experiment *${{ github.event.inputs.experiment || 'all' }}* FAILED on `${{ env.NAMESPACE }}` — <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>"
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_CHAOS_WEBHOOK }}
          SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
Loading
Loading