diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index 4242113..0000000 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1,29 +0,0 @@ -## Description - -Please include a summary of the changes and the related issue. List any dependencies that are required for this change. - -Fixes # (issue) - -## Type of Change - -- [ ] Bug fix (non-breaking change which fixes an issue) -- [ ] New feature (non-breaking change which adds functionality) -- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) -- [ ] Documentation update - -## How Has This Been Tested? - -Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. - -- [ ] Test A -- [ ] Test B - -## Checklist - -- [ ] My code follows the style guidelines of this project -- [ ] I have performed a self-review of my code -- [ ] I have commented my code, particularly in hard-to-understand areas -- [ ] I have made corresponding changes to the documentation -- [ ] My changes generate no new warnings -- [ ] Any dependent changes have been merged and published in downstream modules -- [ ] I have checked my code and corrected any misspellings diff --git a/.github/workflows/AWS_CREATION_PIPELINE.yml b/.github/workflows/AWS_CREATION_PIPELINE.yml new file mode 100644 index 0000000..fc5465b --- /dev/null +++ b/.github/workflows/AWS_CREATION_PIPELINE.yml @@ -0,0 +1,51 @@ +name: AWS RESOURCES CREATION PIPELINE +on: + push: + branches: [ main, nanlabs_challenge] + + +jobs: + terraform: + name: Terraform + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Terraform + uses: hashicorp/setup-terraform@v2 + with: + terraform_version: 1.9.8 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-2 + + - name: Terraform Init + run: terraform init -backend-config=../backend.hcl --reconfigure + working-directory: ./aws_resources + + - name: Terraform Format Check + run: terraform fmt -check -recursive + working-directory: ./aws_resources + + - name: Terraform Validate + run: terraform validate + working-directory: ./aws_resources + + - name: Terraform Plan + env: + TF_VAR_db_user: ${{ secrets.DB_USER }} + TF_VAR_db_password: ${{ secrets.DB_PASSWORD }} + TF_VAR_db_name : ${{ secrets.DB_NAME }} + run: terraform plan -out=tfplan + working-directory: ./aws_resources + + - name: Terraform Apply + if: github.ref == 'refs/heads/nanlabs_challenge' && github.event_name == 'push' + run: terraform apply -auto-approve tfplan + working-directory: ./aws_resources diff --git a/.github/workflows/AWS_DESTROY_RESOURCES_AWS.yml b/.github/workflows/AWS_DESTROY_RESOURCES_AWS.yml new file mode 100644 index 0000000..48b4673 --- /dev/null +++ b/.github/workflows/AWS_DESTROY_RESOURCES_AWS.yml @@ -0,0 +1,44 @@ +name: AWS RESOURCES DESTRUCTION PIPELINE +on: + workflow_dispatch: + +jobs: + terraform: + name: Terraform + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Terraform + uses: hashicorp/setup-terraform@v2 + with: + terraform_version: 1.9.8 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ 
secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-2 + + - name: Terraform Init + run: terraform init -backend-config=../backend.hcl --reconfigure + working-directory: ./aws_resources + + - name: Terraform Plan + env: + TF_VAR_db_user: ${{ secrets.DB_USER }} + TF_VAR_db_password: ${{ secrets.DB_PASSWORD }} + TF_VAR_db_name: ${{ secrets.DB_NAME }} + run: terraform plan -out=tfplan -input=false + working-directory: ./aws_resources + + - name: Terraform Destroy + env: + TF_VAR_db_user: ${{ secrets.DB_USER }} + TF_VAR_db_password: ${{ secrets.DB_PASSWORD }} + TF_VAR_db_name: ${{ secrets.DB_NAME }} + run: terraform destroy -auto-approve -input=false + working-directory: ./aws_resources diff --git a/.github/workflows/PRECOMMIT.yml b/.github/workflows/PRECOMMIT.yml new file mode 100644 index 0000000..4222679 --- /dev/null +++ b/.github/workflows/PRECOMMIT.yml @@ -0,0 +1,48 @@ +name: TERRAFORM PRECOMMIT HOOKS + +on: + pull_request: + push: + branches: [ main, develop, feature/**, nanlabs_challenge] + +jobs: + precommit: + runs-on: ubuntu-latest + name: Run Terraform Pre-Commit Checks + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup Python 3.9.17 + uses: actions/setup-python@v4 + with: + python-version: '3.9.17' + + - name: Install Terraform + uses: hashicorp/setup-terraform@v3 + with: + terraform_version: 1.9.8 + + - name: Install TFLint + run: | + curl -s https://raw.githubusercontent.com/terraform-linters/tflint/master/install_linux.sh | bash + + - name: Install terraform-docs + run: | + curl -Lo terraform-docs.tar.gz https://github.com/terraform-docs/terraform-docs/releases/download/v0.17.0/terraform-docs-v0.17.0-linux-amd64.tar.gz + tar -xzf terraform-docs.tar.gz + sudo mv terraform-docs /usr/local/bin/ + terraform-docs --version + + - name: Install pre-commit + run: pip install pre-commit + + - name: Cache pre-commit + uses: actions/cache@v4 + with: + path: ~/.cache/pre-commit + key: ${{ runner.os }}-precommit-${{ hashFiles('.pre-commit-config.yaml') }} + + - name: Run pre-commit hooks + run: pre-commit run --all-files --show-diff-on-failure \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..ebb62df --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,9 @@ +repos: + - repo: https://github.com/antonbabenko/pre-commit-terraform + rev: v1.88.0 + hooks: + - id: terraform_fmt + - id: terraform_validate + - id: terraform_tflint + args: + - "--args=--call-module-type=all" \ No newline at end of file diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 48cc25d..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2025 NaN Labs - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/README.md b/README.md index 1844c9e..588bf73 100644 --- a/README.md +++ b/README.md @@ -1,91 +1,390 @@ -# πŸš€ Cloud Data Engineer Challenge +# 🌩️ Cloud Data Engineer Challenge -Welcome to the **Cloud Data Engineer Challenge!** πŸŽ‰ This challenge is designed to evaluate your ability to work with **Infrastructure as Code (IaC), AWS data services, and data engineering workflows**, ensuring efficient data ingestion, storage, and querying. +## 🧭 Overview -> [!NOTE] -> You can use **any IaC tool of your choice** (Terraform preferred, but alternatives are allowed). If you choose a different tool or a combination of tools, **justify your decision!** +This project builds an **AWS-based data ingestion and processing architecture** using **S3 β†’ Lambda β†’ RDS (PostgreSQL + PostGIS) β†’ API Gateway**. +All infrastructure is defined and deployed using **Terraform**, featuring **KMS encryption**, **CloudWatch monitoring**, **automatic RDS backups**, and **on-demand view creation** via Lambda. -## ⚑ Challenge Overview +--- -Your task is to deploy the following infrastructure on AWS: +## ⚠️ Critical Configuration Notes -> 🎯 **Key Objectives:** +The **bucket configuration** is a critical part of this project. Two different S3 buckets are required: -- **An S3 bucket** that will receive data files as new objects. -- **A Lambda function** that is triggered by a `PUT` event in the S3 bucket. -- **The Lambda function must:** - - Process the ingested data and perform a minimal aggregation. - - Store the processed data in a **PostgreSQL database with PostGIS enabled**. - - Expose an API Gateway endpoint (`GET /aggregated-data`) to query and retrieve the aggregated data. -- **A PostgreSQL database** running in a private subnet with PostGIS enabled. -- **Networking must include:** VPC, public/private subnets, and security groups. -- **The Lambda must be in a private subnet** and use a NAT Gateway in a public subnet for internet access 🌍 -- **CloudWatch logs** should capture Lambda execution details and possible errors. +1. **Terraform State Bucket** – used to store the Terraform state file (`terraform.tfstate`). +2. **Data Upload Bucket** – used to upload files that trigger the Lambda function. -> [!IMPORTANT] -> Ensure that your solution is modular, well-documented, and follows best practices for security and maintainability. +### πŸͺ£ Terraform State Bucket +- This bucket is configured in the `backend.hcl` file. +- If you want to **change the bucket name**: + - First, **validate that the bucket does not already exist** in your AWS account. + - Then, **create the new bucket** following the same steps described in the [Prerequisites](#-prerequisites) section. + - Finally, update the `bucket` value in the `backend.hcl` file to match the new name. -## πŸ“Œ Requirements +### πŸ“‚ Data Upload Bucket +- This bucket is where files are uploaded to trigger the Lambda function. +- If you want to **change the variable name or bucket value**, update it in: + ``` + bucket_module β†’ variables.tf β†’ default value + ``` +- Before changing it, **make sure that the bucket exists**, as explained in the prerequisites section. 
+- Terraform uses this variable to connect the S3 event notification with the Lambda function, so the name must match an existing bucket. -### πŸ›  Tech Stack +Incorrect configuration of either bucket will prevent Terraform from deploying or the Lambda from being triggered correctly. -> ⚑ **Must Include:** +--- -- **IaC:** Any tool of your choice (**Terraform preferred**, but others are allowed if justified). -- **AWS Services:** S3, Lambda, API Gateway, CloudWatch, PostgreSQL with PostGIS (RDS or self-hosted on EC2). +## 🧰 Prerequisites -### πŸ“„ Expected Deliverables +### 1️⃣ AWS CLI Installation -> πŸ“₯ **Your submission must be a Pull Request that includes:** +The **AWS CLI** is essential for managing credentials and configuring your environment. Follow these steps for installation: -- **An IaC module** that deploys the entire architecture. -- **A `README.md`** with deployment instructions and tool selection justification. -- **A working API Gateway endpoint** that returns the aggregated data stored in PostgreSQL. -- **CloudWatch logs** capturing Lambda execution details. -- **Example input files** to trigger the data pipeline (placed in an `examples/` directory). -- **A sample event payload** (JSON format) to simulate the S3 `PUT` event. +**macOS Installation:** +```bash +brew install awscli +``` -> [!TIP] -> Use the `docs` folder to store any additional documentation or diagrams that help explain your solution. -> Mention any assumptions or constraints in your `README.md`. +**Verify Installation:** +```bash +aws --version +``` -## 🌟 Nice to Have +**Windows Installation:** +For Windows users, refer to the official [AWS CLI Installation Guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html). -> πŸ’‘ **Bonus Points For:** +--- -- **Data Quality & Validation**: Implementing **schema validation before storing data in PostgreSQL**. -- **Indexing & Query Optimization**: Using **PostGIS spatial indexing** for efficient geospatial queries. -- **Monitoring & Alerts**: Setting up **AWS CloudWatch Alarms** for S3 event failures or Lambda errors. -- **Automated Data Backups**: Creating periodic **database backups to S3** using AWS Lambda or AWS Backup. -- **GitHub Actions for validation**: Running **`terraform fmt`, `terraform validate`**, or equivalent for the chosen IaC tool. -- **Pre-commit hooks**: Ensuring linting and security checks before committing. -- **Docker for local testing**: Using **Docker Compose to spin up**: - - Running a local PostgreSQL database with PostGIS to simulate the cloud environment πŸ›  - - Providing a local S3-compatible service (e.g., MinIO) to test file ingestion before deployment πŸ–₯ +### 2️⃣ Terraform Environment Configuration -> [!TIP] -> Looking for inspiration or additional ideas to earn extra points? Check out our **[Awesome NaNLABS repository](https://github.com/nanlabs/awesome-nan)** for reference projects and best practices! 
πŸš€ +**Check if the S3 Bucket Exists:** +```bash +aws s3api head-bucket --bucket your_bucket +``` -## πŸ“₯ Submission Guidelines +**Create DynamoDB Table for Terraform State Locking:** +```bash +aws dynamodb create-table --table-name terraform-lock-table --attribute-definitions AttributeName=LockID,AttributeType=S --key-schema AttributeName=LockID,KeyType=HASH --billing-mode PAY_PER_REQUEST --region your_region +``` -> πŸ“Œ **Follow these steps to submit your solution:** +**Create S3 Bucket for Terraform State:** +```bash +aws s3api create-bucket --bucket your_bucket --region your_region --create-bucket-configuration LocationConstraint=your_region +``` -1. **Fork this repository.** -2. **Create a feature branch** for your implementation. -3. **Commit your changes** with meaningful commit messages. -4. **Open a Pull Request** following the provided template. -5. **Our team will review** and provide feedback. +**Apply Bucket Policies:** +```bash +aws s3api put-public-access-block --bucket your_bucket --public-access-block-configuration BlockPublicAcls=true,IgnorePublicAcls=true,BlockPublicPolicy=true,RestrictPublicBuckets=true +``` -## βœ… Evaluation Criteria +--- -> πŸ” **What we'll be looking at:** +### 3️⃣ Backend Configuration for Terraform State -- **Correctness and completeness** of the **data pipeline**. -- **Use of best practices for event-driven processing** (S3 triggers, Lambda execution). -- **Data transformation & aggregation logic** implemented in Lambda. -- **Optimization for geospatial queries** using PostGIS. -- **Data backup & integrity strategies** (optional, e.g., automated S3 backups). -- **CI/CD automation using GitHub Actions and pre-commit hooks** (optional). -- **Documentation clarity**: Clear explanation of data flow, transformation logic, and infrastructure choices. +Update the `backend.hcl` file with your Terraform backend configuration: +```hcl +bucket = "your-bucket" +``` +--- -## 🎯 **Good luck and happy coding!** πŸš€ +## πŸ“¦ Triggering the Lambda Function via S3 + +Once the Terraform infrastructure is deployed, you can **trigger the Lambda function automatically** by uploading files to the configured **S3 bucket**. + +Each time a new object is uploaded, the **S3 event notification** will invoke the Lambda to process the data and store it in the PostgreSQL database. + +### πŸ“ Use the `sample_files` Folder + +This repository includes a directory named **`sample_files/`**, which contains example files to test the Lambda integration. + +#### πŸ§ͺ Steps to Trigger the Lambda + +1. Identify your S3 bucket name from Terraform outputs or the AWS Console. +2. Upload a sample file from the `sample_files` directory: + ```bash + aws s3 cp sample_files/example_data.csv s3://your_bucket_name/ + ``` +3. The **S3 ObjectCreated:Put** event will automatically **invoke the Lambda**. +4. The Lambda function reads and processes the file, then loads the resulting data into **RDS (PostgreSQL)**. +5. Check Lambda execution logs in: + ``` + Amazon CloudWatch β†’ Log groups β†’ /aws/lambda/your_lambda_name + ``` +6. Also the lambda can be triggered loading the files manually using AWS Console directly + +#### πŸ“ Notes + +- Only `.csv` files should be uploaded. +- The schema (columns, types) will be validated for the lambda. + + +--- + + +### βš™οΈ 4️⃣ Required Environment Variables + +Before running Terraform or executing GitHub Actions pipelines, make sure the following **environment variables** are configured. 
+If they are missing, the **pipeline will fail** during deployment or Lambda provisioning.
+
+| Variable | Description |
+|-----------|--------------|
+| `DB_USER` | Username for the PostgreSQL database, passed to the RDS module and the Lambda |
+| `DB_NAME` | Name of the PostgreSQL database used by the Lambda and the RDS module |
+| `DB_PASSWORD` | Password used by the RDS module for database access |
+| `AWS_ACCESS_KEY_ID` | AWS access key ID for authentication |
+| `AWS_SECRET_ACCESS_KEY` | AWS secret access key associated with the account |
+
+**These variables must be configured under the repository settings:**
+πŸ‘‰ `Settings β†’ Secrets and variables β†’ Actions β†’ Repository secrets`
+
+The GitHub Actions workflows automatically load these variables at runtime to authenticate and provision AWS resources.
+
+---
+
+## βš™οΈ Architecture
+
+### πŸ”Ή Key Components
+
+- **Amazon S3**
+  Receives CSV data files and triggers the **Lambda** on `ObjectCreated:Put`.
+
+- **AWS Lambda**
+  Core processing unit performing:
+  - **Data ingestion**: Reads S3 files and validates schemas.
+  - **Transformation**: Converts latitude/longitude into geospatial `geometry(Point)` using `ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)`.
+  - **Database loading**: Inserts rows into the PostgreSQL `crime_incidents` table.
+  - **Backup automation**: Calls the RDS `create_db_snapshot` API through boto3 to create **RDS snapshots** after data insertion.
+  - **View creation**: Ensures existence of **`v_crime_summary`**, an analytical view for aggregated crime reporting.
+
+- **Amazon RDS (PostgreSQL + PostGIS)**
+  Runs in a private subnet with **PostGIS extensions** (`CREATE EXTENSION postgis;`).
+  Secrets (hostname, username, password) are securely retrieved from **AWS Secrets Manager**.
+
+- **Amazon API Gateway**
+  REST interface exposing `/postgresql-api-conn-path`, allowing real-time querying of both `crime_incidents` and `v_crime_summary`.
+
+- **Networking (VPC)**
+  Private subnet for the Lambda, with a NAT Gateway for internet access and **Security Groups** that allow it to reach RDS.
+
+- **CloudWatch & SNS**
+  Logs all Lambda runs, monitors failures, and sends alerts through SNS.
+
+---
+
+## 🧱 Repository Structure
+
+```
+aws_resources/
+β”œβ”€β”€ bucket_module/             # S3 bucket + KMS encryption
+β”œβ”€β”€ lambda_module/             # Lambda, IAM, Docker build, and alerts
+β”‚   β”œβ”€β”€ iam.tf
+β”‚   β”œβ”€β”€ lambda.tf
+β”‚   └── resources/python/aws_lambda/
+β”‚       β”œβ”€β”€ lambda_function.py     # Lambda handler entry point
+β”‚       β”œβ”€β”€ get_connection.py      # Validates DB connection and creates the target table
+β”‚       β”œβ”€β”€ get_transformation.py  # Inserts data and builds view v_crime_summary
+β”‚       β”œβ”€β”€ get_response.py        # Dynamically selects data and generates JSON response
+β”‚       β”œβ”€β”€ get.py                 # Builds the body structure for GET API responses
+β”‚       └── utils.py               # Common utilities (backup and secret)
+β”œβ”€β”€ api_gateway_module/        # REST API Gateway (AWS_PROXY integration)
+β”œβ”€β”€ network_module/            # VPC, subnets, NAT Gateway, and Security Groups
+β”œβ”€β”€ rds_module/                # PostgreSQL RDS + AWS Secrets Manager integration
+β”œβ”€β”€ providers.tf               # AWS provider definition
+β”œβ”€β”€ backend.hcl                # Remote backend configuration (S3 + DynamoDB)
+└── .pre-commit-config.yaml    # Pre-commit hooks for Terraform validation
+```
+
+---
+
+## πŸ“‘ Data Flow
+
+### 1️⃣ Ingestion from S3
+
+When a CSV is uploaded, S3 triggers the Lambda (`ObjectCreated:Put` event).
+The Lambda:
+- Reads the file from S3.
+- Validates the schema using Pandas.
+- Converts coordinates to PostGIS geometry.
+- Inserts the rows into `crime_incidents`.
+- Creates or replaces the summary view.
+- Creates a database backup by taking an on-demand RDS snapshot.
+
+View created:
+```sql
+CREATE OR REPLACE VIEW v_crime_summary AS
+SELECT
+    offense,
+    district,
+    COUNT(*) AS total,
+    ST_Collect(geom) AS geom_cluster
+FROM crime_incidents
+WHERE geom IS NOT NULL
+GROUP BY offense, district;
+```
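+
+A simplified, illustrative sketch of this ingestion step is shown below. It is **not** the exact code in `get_transformation.py`: the column names are assumed from the API response example further down, and `conn_params` stands in for the credentials the real Lambda resolves from Secrets Manager.
+
+```python
+import io
+
+import boto3
+import pandas as pd
+import psycopg2
+
+# Columns every uploaded CSV is expected to provide (illustrative subset).
+REQUIRED_COLUMNS = {"ccn", "offense", "district", "latitude", "longitude"}
+
+
+def ingest_csv(bucket: str, key: str, conn_params: dict) -> int:
+    """Read a CSV from S3, validate its schema, and insert rows with a PostGIS point."""
+    s3 = boto3.client("s3")
+    raw = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
+
+    df = pd.read_csv(io.BytesIO(raw))
+    df.columns = [c.strip().lower() for c in df.columns]
+
+    missing = REQUIRED_COLUMNS - set(df.columns)
+    if missing:
+        raise ValueError(f"Schema validation failed, missing columns: {sorted(missing)}")
+
+    insert_sql = """
+        INSERT INTO crime_incidents (ccn, offense, district, latitude, longitude, geom)
+        VALUES (%s, %s, %s, %s, %s, ST_SetSRID(ST_MakePoint(%s, %s), 4326))
+    """
+
+    conn = psycopg2.connect(**conn_params)  # credentials come from Secrets Manager in the real Lambda
+    try:
+        with conn, conn.cursor() as cur:  # the outer "with" commits the transaction on success
+            for row in df.itertuples(index=False):
+                cur.execute(
+                    insert_sql,
+                    (row.ccn, row.offense, row.district, row.latitude, row.longitude,
+                     row.longitude, row.latitude),
+                )
+    finally:
+        conn.close()
+
+    return len(df)
+```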
+
+---
+
+### 2️⃣ API Gateway Query
+
+**Request:**
+```
+GET https://your_api_gateway_endpoint/dev/postgresql-api-conn-path?table=crime_incidents
+```
+
+**Real Response Example:**
+```json
+[
+  {
+    "id": 384,
+    "ccn": "25141990",
+    "report_date": 1758066539000,
+    "shift": "EVENING",
+    "method": "OTHERS",
+    "offense": "ROBBERY",
+    "block": "1300 - 1399 BLOCK OF 2ND STREET NE",
+    "ward": "6.0",
+    "district": "5.0",
+    "psa": "501.0",
+    "neighborhood_cluster": "Cluster 25",
+    "latitude": 38.9078254892,
+    "longitude": -77.0035141787,
+    "geom": "0101000020E6100000AFA58893394053C06B4B29A033744340"
+  }
+]
+```
+
+**Querying the view:**
+```
+GET https://your_api_gateway_endpoint/dev/postgresql-api-conn-path?table=crime_summary
+```
+**Real Response Example:**
+```json
+[
+  {
+    "offense": "ASSAULT W/DANGEROUS WEAPON",
+    "district": "3.0",
+    "total": 2,
+    "geom_cluster": "0104000020E610000002000000010100000057AD2053944053C053F46BA32D74434001010000004FDE4198AD4053C0584EB940ED754340"
+  }
+]
+```
+
+## 🧾 Example Lambda Event Payload (S3 Trigger)
+
+```json
+{
+  "Records": [
+    {
+      "eventVersion": "2.1",
+      "eventSource": "aws:s3",
+      "awsRegion": "us-east-2",
+      "eventTime": "2025-10-03T16:09:50.130Z",
+      "eventName": "ObjectCreated:Put",
+      "userIdentity": {"principalId": "ACF3QWLPS3VUX"},
+      "requestParameters": {"sourceIPAddress": "191.95.19.221"},
+      "responseElements": {
+        "x-amz-request-id": "RGCMFQ73Z5425D4Y",
+        "x-amz-id-2": "dV1TSmaGWQzgc4ezL1QaIx00s13H9VFUFHXTcplY1O8VqFaSw7Aj/FfYe3E+N6AoYGiMEW1J1ywKo/42G1DaJTCIh5mMnEIZGJ6pd+7SHu4="
+      },
+      "s3": {
+        "s3SchemaVersion": "1.0",
+        "configurationId": "tf-s3-lambda-20251003152624031900000001",
+        "bucket": {
+          "name": "mv-pr-dt",
+          "arn": "arn:aws:s3:::mv-pr-dt"
+        },
+        "object": {
+          "key": "Crime_Incidents_part_1.csv",
+          "size": 1571,
+          "eTag": "55f777571307999c7d0fee2abe75b5fc",
+          "sequencer": "0068DFF54E183036C3"
+        }
+      }
+    }
+  ]
+}
+```
+
+---
+
+## πŸš€ Deployment & Automation
+
+### 🧩 Terraform Commands
+
+```bash
+terraform init -backend-config=backend.hcl
+terraform fmt -recursive
+terraform validate
+terraform plan
+terraform apply -auto-approve
+terraform destroy -auto-approve
+```
+
+---
+
+### πŸ€– GitHub Actions CI/CD
+
+Located in `.github/workflows/`:
+
+#### `AWS_CREATION_PIPELINE.yml`
+- Runs on push to `main` and `nanlabs_challenge`.
+- Executes Terraform format checks, validation, plan, and apply (the apply step only runs on `nanlabs_challenge`).
+
+#### `AWS_DESTROY_RESOURCES_AWS.yml`
+- Manually triggered (`workflow_dispatch`) cleanup workflow that tears down all provisioned resources; a quick endpoint check to run beforehand is sketched below.
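+
+Before triggering a teardown, it can help to confirm that the deployed endpoint still answers as shown in the API Gateway examples above. The following is a minimal client sketch; the URL is a placeholder for your own `dev` stage invoke URL, and the `requests` dependency is assumed:
+
+```python
+import requests
+
+# Placeholder: replace with the invoke URL of your deployed "dev" stage.
+API_URL = "https://your_api_gateway_endpoint/dev/postgresql-api-conn-path"
+
+
+def fetch(table: str) -> list:
+    """Query the API Gateway endpoint and return the decoded JSON rows."""
+    response = requests.get(API_URL, params={"table": table}, timeout=30)
+    response.raise_for_status()
+    return response.json()
+
+
+if __name__ == "__main__":
+    rows = fetch("crime_summary")
+    print(f"API is reachable, {len(rows)} aggregated rows returned")
+```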
+ +#### `PRECOMMIT.yml` +- Validate locally before committing: +```bash +pre-commit run --all-files --show-diff-on-failure +``` +Includes: +- `terraform_fmt` +- `terraform_validate` +- `terraform_tflint` +- `terraform_docs` + +--- + +## πŸ” Security & Monitoring + +| Component | Description | +|------------|--------------| +| **KMS** | Encrypts all S3 bucket objects | +| **Secrets Manager** | Stores DB credentials securely | +| **CloudWatch Logs** | Tracks Lambda execution logs | +| **SNS Notifications** | Sends alerts for failures | +| **CloudWatch Alarms** | Monitors Lambda error metrics | + +--- + +## 🧠 Best Practices Implemented + +- Modular Terraform architecture +- Automated **RDS backup** on ingestion +- Idempotent **view creation** for analytics +- Pre-commit & CI/CD enforcement +- Full observability with CloudWatch and SNS + +--- + +## πŸ› οΈ Troubleshooting + +| Issue | Possible Cause | Fix | +|--------|----------------|-----| +| Lambda not triggered | S3 event config issue | Check `aws_s3_bucket_notification` | +| DB connection error | Invalid Secrets Manager entry | Validate `postgresql_conn` | +| Snapshot not found | Missing IAM permission | Verify `RDSBackupAccess` policy | +| View missing | Lambda timeout or DB lock | Check CloudWatch logs | + +--- + +## βœ… Current Status + +| Module | Status | Description | +|---------|---------|-------------| +| **RDS** | βœ… | PostgreSQL + PostGIS operational | +| **Lambda** | βœ… | Ingestion, backup & view creation | +| **S3 Trigger** | βœ… | PUT event connected | +| **API Gateway** | βœ… | Query interface live | +| **CI/CD** | βœ… | Automated workflows | +| **Monitoring** | βœ… | CloudWatch & SNS active | \ No newline at end of file diff --git a/aws_resources/api_gateway_module/api_gateway.tf b/aws_resources/api_gateway_module/api_gateway.tf new file mode 100644 index 0000000..916974a --- /dev/null +++ b/aws_resources/api_gateway_module/api_gateway.tf @@ -0,0 +1,127 @@ +# Creates the main API Gateway REST API. +# This defines the root API container for both GET and POST methods. +resource "aws_api_gateway_rest_api" "t1_db_conn_api" { + name = "postgresql-conn" + description = "RDS postgresql API connection" + + endpoint_configuration { + types = ["REGIONAL"] + } +} + +# Creates a specific resource path under the root of the API. +resource "aws_api_gateway_resource" "t1_db_conn_api_path" { + rest_api_id = aws_api_gateway_rest_api.t1_db_conn_api.id + parent_id = aws_api_gateway_rest_api.t1_db_conn_api.root_resource_id + path_part = "postgresql-api-conn-path" +} + +# Defines the POST HTTP method for the API resource. +resource "aws_api_gateway_method" "t1_db_conn_api_post" { + rest_api_id = aws_api_gateway_rest_api.t1_db_conn_api.id + resource_id = aws_api_gateway_resource.t1_db_conn_api_path.id + http_method = "POST" + authorization = "NONE" +} + +# Defines the expected 200 OK response for POST requests. +resource "aws_api_gateway_method_response" "response_200_post" { + rest_api_id = aws_api_gateway_rest_api.t1_db_conn_api.id + resource_id = aws_api_gateway_resource.t1_db_conn_api_path.id + http_method = aws_api_gateway_method.t1_db_conn_api_post.http_method + status_code = "200" + + response_models = { + "application/json" = "Empty" + } +} + +# Integrates the POST method with the target Lambda function using AWS_PROXY. +# AWS_PROXY means API Gateway passes the full request directly to Lambda. 
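+# With proxy integration the Lambda must return an object shaped like
+# {"statusCode": 200, "headers": {...}, "body": "..."} so API Gateway can map the response.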
+resource "aws_api_gateway_integration" "integration_post" { + rest_api_id = aws_api_gateway_rest_api.t1_db_conn_api.id + resource_id = aws_api_gateway_resource.t1_db_conn_api_path.id + http_method = aws_api_gateway_method.t1_db_conn_api_post.http_method + type = "AWS_PROXY" + integration_http_method = "POST" + uri = var.invoke_arn +} + +# Defines the GET HTTP method for the same API resource. +resource "aws_api_gateway_method" "t1_db_conn_api_get" { + rest_api_id = aws_api_gateway_rest_api.t1_db_conn_api.id + resource_id = aws_api_gateway_resource.t1_db_conn_api_path.id + http_method = "GET" + authorization = "NONE" +} + +# Defines the expected 200 OK response for GET requests. +resource "aws_api_gateway_method_response" "response_200_get" { + rest_api_id = aws_api_gateway_rest_api.t1_db_conn_api.id + resource_id = aws_api_gateway_resource.t1_db_conn_api_path.id + http_method = aws_api_gateway_method.t1_db_conn_api_get.http_method + status_code = "200" + + response_models = { + "application/json" = "Empty" + } +} + +# Integrates the GET method with the same Lambda function using AWS_PROXY. +# The integration method must remain POST when using AWS_PROXY. +resource "aws_api_gateway_integration" "integration_get" { + rest_api_id = aws_api_gateway_rest_api.t1_db_conn_api.id + resource_id = aws_api_gateway_resource.t1_db_conn_api_path.id + http_method = aws_api_gateway_method.t1_db_conn_api_get.http_method + type = "AWS_PROXY" + integration_http_method = "POST" + uri = var.invoke_arn +} + +# Grants API Gateway permission to invoke the Lambda function. +resource "aws_lambda_permission" "allow_apigateway" { + statement_id = "AllowAPIGatewayInvoke" + action = "lambda:InvokeFunction" + function_name = var.function_name + principal = "apigateway.amazonaws.com" + source_arn = "${aws_api_gateway_rest_api.t1_db_conn_api.execution_arn}/dev/*" +} + +# Deployment for POST and GET +resource "aws_api_gateway_deployment" "api_deployment" { + rest_api_id = aws_api_gateway_rest_api.t1_db_conn_api.id + + depends_on = [ + # POST + aws_api_gateway_method.t1_db_conn_api_post, + aws_api_gateway_integration.integration_post, + aws_api_gateway_method_response.response_200_post, + + # GET + aws_api_gateway_method.t1_db_conn_api_get, + aws_api_gateway_integration.integration_get, + aws_api_gateway_method_response.response_200_get + ] +} + +# Creates the "dev" stage for the API Gateway deployment. +# This defines the URL stage used in the final endpoint. 
+resource "aws_api_gateway_stage" "postgresql_api_conn_stage" { + rest_api_id = aws_api_gateway_rest_api.t1_db_conn_api.id + deployment_id = aws_api_gateway_deployment.api_deployment.id + stage_name = "dev" +} + +# Caching and throttling +resource "aws_api_gateway_method_settings" "api_method_settings" { + rest_api_id = aws_api_gateway_rest_api.t1_db_conn_api.id + stage_name = aws_api_gateway_stage.postgresql_api_conn_stage.stage_name + method_path = "*/*" + + settings { + cache_data_encrypted = false + cache_ttl_in_seconds = 0 + throttling_burst_limit = 500 + throttling_rate_limit = 1000 + } +} diff --git a/aws_resources/api_gateway_module/providers.tf b/aws_resources/api_gateway_module/providers.tf new file mode 100644 index 0000000..f56e4e8 --- /dev/null +++ b/aws_resources/api_gateway_module/providers.tf @@ -0,0 +1,9 @@ +terraform { + required_version = ">= 1.5.0" + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.60.0" + } + } +} \ No newline at end of file diff --git a/aws_resources/api_gateway_module/variables.tf b/aws_resources/api_gateway_module/variables.tf new file mode 100644 index 0000000..c88ab79 --- /dev/null +++ b/aws_resources/api_gateway_module/variables.tf @@ -0,0 +1,7 @@ +variable "invoke_arn" { + type = string +} + +variable "function_name" { + type = string +} diff --git a/aws_resources/bucket_module/bucket.tf b/aws_resources/bucket_module/bucket.tf new file mode 100644 index 0000000..65dd0a9 --- /dev/null +++ b/aws_resources/bucket_module/bucket.tf @@ -0,0 +1,62 @@ +data "aws_caller_identity" "current" {} + +# Creating the KMS key resource +resource "aws_kms_key" "dts_kms_key" { + description = "Key for encryption" + enable_key_rotation = true + customer_master_key_spec = "SYMMETRIC_DEFAULT" +} + +# Activating KMS key policy +resource "aws_kms_key_policy" "bucket_kms_key" { + key_id = aws_kms_key.dts_kms_key.id + policy = jsonencode({ + Version = "2012-10-17" + Id = "key-default-1" + Statement = [ + { + Sid = "Enable IAM User Permissions" + Effect = "Allow" + Principal = { + AWS = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:root" + } + Action = "kms:*" + Resource = "*" + } + ] + }) +} + +# Creating KMS key alias +resource "aws_kms_alias" "dts_kms_alias" { + name = "alias/mv-pr-dt-key" + target_key_id = aws_kms_key.dts_kms_key.key_id +} + +# Creating the S3 bucket +resource "aws_s3_bucket" "bucket_creation" { + bucket = var.curated_bucket + force_destroy = true +} + +# Setting bucket access +resource "aws_s3_bucket_public_access_block" "public_access_block" { + bucket = aws_s3_bucket.bucket_creation.id + + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true +} + +# Setting up bucket encryption with the KMS key +resource "aws_s3_bucket_server_side_encryption_configuration" "ss_kms_key" { + bucket = aws_s3_bucket.bucket_creation.id + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "aws:kms" + kms_master_key_id = aws_kms_key.dts_kms_key.arn + } + } +} \ No newline at end of file diff --git a/aws_resources/bucket_module/outputs.tf b/aws_resources/bucket_module/outputs.tf new file mode 100644 index 0000000..03fd949 --- /dev/null +++ b/aws_resources/bucket_module/outputs.tf @@ -0,0 +1,15 @@ +output "target_bucket" { + value = aws_s3_bucket.bucket_creation.bucket +} + +output "target_key" { + value = aws_kms_key.dts_kms_key.arn +} + +output "bucket_id" { + value = aws_s3_bucket.bucket_creation.id +} + +output "bucket_arn" { + value = 
aws_s3_bucket.bucket_creation.arn +} diff --git a/aws_resources/bucket_module/providers.tf b/aws_resources/bucket_module/providers.tf new file mode 100644 index 0000000..f56e4e8 --- /dev/null +++ b/aws_resources/bucket_module/providers.tf @@ -0,0 +1,9 @@ +terraform { + required_version = ">= 1.5.0" + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.60.0" + } + } +} \ No newline at end of file diff --git a/aws_resources/bucket_module/variables.tf b/aws_resources/bucket_module/variables.tf new file mode 100644 index 0000000..f50f1e8 --- /dev/null +++ b/aws_resources/bucket_module/variables.tf @@ -0,0 +1,5 @@ +variable "curated_bucket" { + description = "Curated bucket" + type = string + default = "mv-pr-dt" +} diff --git a/aws_resources/lambda_module/iam.tf b/aws_resources/lambda_module/iam.tf new file mode 100644 index 0000000..e735fd0 --- /dev/null +++ b/aws_resources/lambda_module/iam.tf @@ -0,0 +1,96 @@ +# Lambda IAM Role +resource "aws_iam_role" "iam_dev_role_pr_mv" { + name = "iam_for_dev_pr_mv" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "lambda.amazonaws.com" + } + } + ] + }) +} + +# Lambda Policy Document +data "aws_iam_policy_document" "pipeline_dev_policy_pr_mv" { + statement { + sid = "CloudWatchLogging" + effect = "Allow" + actions = [ + "logs:DescribeLogGroups", + "logs:DescribeLogStreams", + "logs:GetLogEvents", + "logs:FilterLogEvents", + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents", + "logs:PutRetentionPolicy", + "logs:DeleteLogGroup", + "logs:DeleteLogStream" + ] + resources = ["*"] + } + + statement { + sid = "S3AndKMSAccess" + effect = "Allow" + actions = [ + "s3:ListBucket", + "s3:GetBucketLocation", + "s3:CreateBucket", + "s3:DeleteBucket", + "s3:PutObject", + "s3:GetObject", + "s3:DeleteObject", + "kms:*" + ] + resources = ["*"] + } + + statement { + sid = "SecretsManagerAccess" + effect = "Allow" + actions = [ + "secretsmanager:GetSecretValue" + ] + resources = ["*"] + } + + statement { + sid = "EC2NetworkInterfaceAccess" + effect = "Allow" + actions = [ + "ec2:CreateNetworkInterface", + "ec2:DescribeNetworkInterfaces", + "ec2:DeleteNetworkInterface", + "ec2:AssignPrivateIpAddresses", + "ec2:UnassignPrivateIpAddresses" + ] + resources = ["*"] + } + + statement { + sid = "RDSBackupAccess" + effect = "Allow" + actions = [ + "rds:CreateDBSnapshot", + "rds:DescribeDBSnapshots", + "rds:DeleteDBSnapshot", + "rds:CopyDBSnapshot", + "rds:DescribeDBInstances" + ] + resources = ["*"] + } +} + +# Attach Inline Policy to Role +resource "aws_iam_role_policy" "lambda_permissions" { + name = "lambda_logging_with_layer" + role = aws_iam_role.iam_dev_role_pr_mv.name + policy = data.aws_iam_policy_document.pipeline_dev_policy_pr_mv.json +} diff --git a/aws_resources/lambda_module/lambda.tf b/aws_resources/lambda_module/lambda.tf new file mode 100644 index 0000000..392544e --- /dev/null +++ b/aws_resources/lambda_module/lambda.tf @@ -0,0 +1,165 @@ +# Connect with Docker unix socket +provider "docker" { + host = "unix:///var/run/docker.sock" +} + +# ECR repository creation +resource "aws_ecr_repository" "lambda_repository" { + name = "lambda-mv-pr-repository" + image_tag_mutability = "MUTABLE" + + image_scanning_configuration { + scan_on_push = true + } + + force_delete = true + + tags = { + Environment = "development" + } +} + +# AWS CLI login for ECR +resource "null_resource" "ecr_login" { + provisioner "local-exec" { + 
command = <