Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@ OPENROUTER_API_KEY=your_openrouter_key
BETTER_AUTH_SECRET=your_auth_secret
GOOGLE_CLIENT_ID=your_google_id
GOOGLE_CLIENT_SECRET=your_google_secret
R2_ACCOUNT_ID=...
R2_ACCESS_KEY_ID=...
R2_SECRET_ACCESS_KEY=...
R2_BUCKET_NAME=...
R2_PUBLIC_URL=...
SPACES_ENDPOINT=https://nyc3.digitaloceanspaces.com
SPACES_KEY=your_access_key
SPACES_SECRET=your_secret_key
SPACES_BUCKET=your_bucket_name
SPACES_REGION=nyc3
SPACES_PUBLIC_URL=https://your_bucket_name.nyc3.cdn.digitaloceanspaces.com
MAX_OCR_PAGES=12
MAX_FILE_SIZE_MB=20
POSTGRES_USER=user
POSTGRES_PASSWORD=password
POSTGRES_DB=pyqdb
18 changes: 13 additions & 5 deletions backend/app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,19 @@ class Settings(BaseSettings):
GOOGLE_CLIENT_ID: str
GOOGLE_CLIENT_SECRET: str

R2_ACCOUNT_ID: str
R2_ACCESS_KEY_ID: str
R2_SECRET_ACCESS_KEY: str
R2_BUCKET_NAME: str
R2_PUBLIC_URL: str
# DigitalOcean Spaces
SPACES_ENDPOINT: str
SPACES_KEY: str
SPACES_SECRET: str
SPACES_BUCKET: str
SPACES_REGION: str = "nyc3"
SPACES_PUBLIC_URL: str

# OCR Settings
MAX_OCR_PAGES: int = 12

# Upload Settings
MAX_FILE_SIZE_MB: int = 20

# JWT Settings
JWT_SECRET_KEY: str
Expand Down
3 changes: 2 additions & 1 deletion backend/app/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from .routers import auth
from .routers import auth, resources
from .config import settings

app = FastAPI(title="PYQ Solver API")
Expand All @@ -16,6 +16,7 @@
)

app.include_router(auth.router, prefix="/api")
app.include_router(resources.router, prefix="/api")

@app.get("/")
async def root():
Expand Down
162 changes: 162 additions & 0 deletions backend/app/routers/resources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, status
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from typing import List
import uuid
from ..database import get_db
from ..models.user import User
from ..models.resource import Resource
from ..schemas.resource import ResourceOut
from ..routers.auth import get_current_user
from ..services.storage import storage_service
from arq import create_pool
from ..config import settings
from arq.connections import RedisSettings

router = APIRouter(prefix="/resources", tags=["resources"])

@router.post("/", response_model=ResourceOut)
async def upload_resource(
    type: str = Form(...),
    file: UploadFile = File(...),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """Store an uploaded PDF or text file and queue it for extraction.

    The upload is read in 1 MB chunks so the size limit is enforced without
    first buffering an oversized body, then written to object storage, recorded
    in the database with status ``"processing"`` and handed to the
    ``extraction_task`` background job.

    Args:
        type: Resource category submitted as a form field.
        file: The uploaded file; only ``application/pdf`` and ``text/plain``
            content types are accepted.
        db: Request-scoped async database session.
        current_user: Authenticated owner of the new resource.

    Returns:
        The newly created ``Resource`` row.

    Raises:
        HTTPException: 400 for an unsupported content type, 413 when the file
            exceeds ``settings.MAX_FILE_SIZE_MB``, 500 when the storage upload
            or the job enqueue/commit fails.
    """
    # Imported locally so this block does not depend on a file-level import.
    from fastapi.concurrency import run_in_threadpool

    # Accepted content types, mapped to the extension used when the client
    # filename does not provide a usable one.
    fallback_ext = {"application/pdf": "pdf", "text/plain": "txt"}
    if file.content_type not in fallback_ext:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Only PDF and Text files are supported"
        )

    # Chunked read with size enforcement.
    MAX_BYTES = settings.MAX_FILE_SIZE_MB * 1024 * 1024
    CHUNK_SIZE = 1024 * 1024  # 1MB chunks
    content = bytearray()
    total_size = 0

    while True:
        chunk = await file.read(CHUNK_SIZE)
        if not chunk:
            break
        total_size += len(chunk)
        if total_size > MAX_BYTES:
            raise HTTPException(
                status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
                detail=f"File too large. Maximum size allowed is {settings.MAX_FILE_SIZE_MB}MB"
            )
        content.extend(chunk)

    # Derive the extension defensively: the filename is client-controlled and
    # may be missing, lack a dot, or carry path separators that would otherwise
    # leak into the object key.
    original_name = file.filename or ""
    base_name = original_name.replace("\\", "/").rsplit("/", 1)[-1]
    _, dot, suffix = base_name.rpartition(".")
    if dot and suffix.isalnum():
        ext = suffix
    else:
        ext = fallback_ext[file.content_type]
    object_name = f"user_{current_user.id}/{uuid.uuid4()}.{ext}"

    # Upload to DigitalOcean Spaces. The storage client is synchronous, so run
    # it in the threadpool instead of blocking the event loop.
    file_url = await run_in_threadpool(
        storage_service.upload_file, content, object_name, file.content_type
    )
    if not file_url:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Failed to upload file to storage"
        )

    # Create DB record (not committed yet)
    new_resource = Resource(
        user_id=current_user.id,
        filename=original_name or f"upload.{ext}",
        file_url=file_url,
        type=type,
        status="processing"
    )
    db.add(new_resource)
    await db.flush()  # Flush to get the ID but don't commit

    # Enqueue the extraction job before committing so a row is never left in
    # "processing" without a job behind it.
    # NOTE(review): a worker could start the job before the commit below is
    # visible - confirm extraction_task tolerates a not-yet-committed row.
    committed = False
    try:
        redis = await create_pool(RedisSettings.from_dsn(settings.REDIS_URL))
        try:
            await redis.enqueue_job('extraction_task', str(new_resource.id))
        finally:
            # The pool is created per request; release its connections.
            await redis.close()

        # Only commit if enqueue was successful
        await db.commit()
        committed = True
        await db.refresh(new_resource)
    except Exception as e:
        await db.rollback()
        if not committed:
            # No row references the object any more; remove it (best effort)
            # so a failed request does not leave an orphan in storage.
            await run_in_threadpool(storage_service.delete_file, object_name)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to queue background task: {str(e)}"
        ) from e

    return new_resource

@router.get("/", response_model=List[ResourceOut])
async def list_resources(
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """Return every resource owned by the authenticated user, newest first."""
    owned_newest_first = (
        select(Resource)
        .where(Resource.user_id == current_user.id)
        .order_by(Resource.created_at.desc())
    )
    rows = await db.execute(owned_newest_first)
    return rows.scalars().all()

@router.delete("/{resource_id}")
async def delete_resource(
    resource_id: uuid.UUID,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """Delete one of the caller's resources from storage and the database.

    The stored object is removed first; the database row is deleted only once
    storage reports success, so a failed storage call never leaves a file
    without a record pointing at it.

    Args:
        resource_id: Identifier of the resource to delete.
        db: Request-scoped async database session.
        current_user: Authenticated user; only their own resources match.

    Returns:
        A confirmation message dict.

    Raises:
        HTTPException: 404 when no matching resource belongs to the user,
            500 when the storage deletion fails.
    """
    # Imported locally so this block does not depend on a file-level import.
    from fastapi.concurrency import run_in_threadpool

    result = await db.execute(
        select(Resource).where(Resource.id == resource_id, Resource.user_id == current_user.id)
    )
    resource = result.scalar_one_or_none()

    if not resource:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Resource not found")

    # Recover the object key by stripping the public URL prefix. Only a leading
    # prefix is removed; str.replace would also cut matches later in the URL.
    public_prefix = f"{settings.SPACES_PUBLIC_URL}/"
    object_name = resource.file_url
    if object_name.startswith(public_prefix):
        object_name = object_name[len(public_prefix):]

    # Delete from Spaces. The storage client is synchronous, so run it in the
    # threadpool instead of blocking the event loop.
    success = await run_in_threadpool(storage_service.delete_file, object_name)

    if not success:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Failed to delete file from cloud storage. Database record preserved."
        )

    # Delete from DB only after storage is confirmed deleted
    await db.delete(resource)
    await db.commit()

    return {"message": "Resource deleted successfully"}

@router.post("/{resource_id}/retry", response_model=ResourceOut)
async def retry_extraction(
    resource_id: uuid.UUID,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """Re-queue the extraction job for one of the caller's resources.

    The status is set back to ``"processing"`` and committed only after the
    ``extraction_task`` job has been enqueued successfully.

    Args:
        resource_id: Identifier of the resource to re-process.
        db: Request-scoped async database session.
        current_user: Authenticated user; only their own resources match.

    Returns:
        The refreshed ``Resource`` row.

    Raises:
        HTTPException: 404 when no matching resource belongs to the user,
            500 when the job cannot be queued or the commit fails.
    """
    result = await db.execute(
        select(Resource).where(Resource.id == resource_id, Resource.user_id == current_user.id)
    )
    resource = result.scalar_one_or_none()

    if not resource:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Resource not found")

    # Update status to processing (don't commit yet)
    resource.status = "processing"

    # Re-enqueue background extraction task before committing DB
    try:
        redis = await create_pool(RedisSettings.from_dsn(settings.REDIS_URL))
        try:
            await redis.enqueue_job('extraction_task', str(resource.id))
        finally:
            # The pool is created per request; release its connections.
            await redis.close()

        # Only commit if enqueue was successful
        await db.commit()
        await db.refresh(resource)
    except Exception as e:
        await db.rollback()
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to queue background task: {str(e)}"
        ) from e

    return resource
20 changes: 20 additions & 0 deletions backend/app/schemas/resource.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from pydantic import BaseModel, ConfigDict
from uuid import UUID
from datetime import datetime
from typing import Optional, List

class ResourceBase(BaseModel):
    """Fields shared by every resource schema."""

    filename: str
    # One of: notes | syllabus | past_paper | other
    type: str

class ResourceOut(ResourceBase):
    """Resource representation returned by the API."""

    # Allow building the schema directly from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)

    id: UUID
    user_id: UUID
    file_url: str
    # One of: pending | processing | ready | failed
    status: str
    created_at: datetime

class ResourceList(BaseModel):
    """Envelope holding a list of resources."""

    resources: List[ResourceOut]
42 changes: 42 additions & 0 deletions backend/app/services/storage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import boto3
from botocore.exceptions import ClientError
from ..config import settings
import logging

class StorageService:
def __init__(self):
    """Create a boto3 session and an S3 client configured from settings."""
    self.session = boto3.session.Session()
    # Connection parameters all come from the SPACES_* settings.
    client_options = {
        "region_name": settings.SPACES_REGION,
        "endpoint_url": settings.SPACES_ENDPOINT,
        "aws_access_key_id": settings.SPACES_KEY,
        "aws_secret_access_key": settings.SPACES_SECRET,
    }
    self.client = self.session.client('s3', **client_options)

def upload_file(self, file_content: bytes, object_name: str, content_type: str = 'application/pdf'):
try:
self.client.put_object(
Bucket=settings.SPACES_BUCKET,
Key=object_name,
Body=file_content,
ACL='public-read',
ContentType=content_type
)
return f"{settings.SPACES_PUBLIC_URL}/{object_name}"
Comment on lines +19 to +26
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | 🏗️ Heavy lift

🧩 Analysis chain

🏁 Script executed:

cat backend/app/services/storage.py

Repository: shubhamxdd/pyq-appl

Length of output: 1487


🏁 Script executed:

rg "put_object|upload|storage" backend/app --type py -l | head -20

Repository: shubhamxdd/pyq-appl

Length of output: 157


🏁 Script executed:

rg "public-read\|ACL" backend --type py -A 3 -B 3

Repository: shubhamxdd/pyq-appl

Length of output: 45


🏁 Script executed:

cat backend/app/routers/resources.py

Repository: shubhamxdd/pyq-appl

Length of output: 4101


🏁 Script executed:

cat backend/app/workers/tasks.py

Repository: shubhamxdd/pyq-appl

Length of output: 6629


Use private ACL with pre-signed URLs instead of public-read for uploaded resources.

Files are stored with ACL='public-read' (line 23), making them directly accessible to anyone with the URL regardless of application authentication. Though the API restricts listing/deletion to authenticated users, the URLs themselves (stored in the database and returned to clients) are not secret—if shared or leaked, anyone can access the file without authentication. This is particularly risky for user-uploaded PDFs and documents that may contain sensitive data.

Refactor to use private ACL and generate pre-signed URLs for temporary, authenticated access. Update the extraction task to use the boto3 client directly for internal file access rather than fetching via HTTP.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@backend/app/services/storage.py` around lines 19 - 26, Change object uploads
to private ACL instead of public-read: in the code path that calls
self.client.put_object (and uses settings.SPACES_BUCKET, Key=object_name), set
ACL='private' (or omit ACL) and stop returning the direct public URL
(settings.SPACES_PUBLIC_URL/...); instead return the object key or a stable
identifier and generate time-limited access via
self.client.generate_presigned_url('get_object', Params={'Bucket':
settings.SPACES_BUCKET, 'Key': object_name}, ExpiresIn=...) when the API needs
to serve a download. Also update the extraction task to read files with the
boto3 client (self.client.get_object / streaming body) using the same
SPACES_BUCKET and object_name rather than fetching the public HTTP URL.

except ClientError as e:
logging.error(f"Error uploading file to DigitalOcean Spaces: {e}")
return None

def delete_file(self, object_name: str):
    """Delete ``object_name`` from the configured bucket.

    Args:
        object_name: Key of the object inside ``settings.SPACES_BUCKET``.

    Returns:
        ``True`` when the delete call succeeds, ``False`` when the storage
        client raises; the error is logged rather than propagated.
    """
    # Imported locally so this block does not depend on a file-level import.
    from botocore.exceptions import BotoCoreError

    try:
        self.client.delete_object(
            Bucket=settings.SPACES_BUCKET,
            Key=object_name
        )
    except (ClientError, BotoCoreError) as e:
        # ClientError covers error responses from the service; BotoCoreError
        # covers client-side failures such as connection errors, which would
        # otherwise escape the True/False contract callers rely on.
        logging.error("Error deleting file from DigitalOcean Spaces: %s", e)
        return False
    return True

storage_service = StorageService()
3 changes: 2 additions & 1 deletion backend/app/workers/arq_worker.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import asyncio
from arq.connections import RedisSettings
from app.config import settings
from .tasks import extraction_task

async def ping(ctx):
    """Health-check job: ignore the worker context and answer "pong"."""
    reply = "pong"
    return reply
Expand All @@ -12,7 +13,7 @@ async def shutdown(ctx):
pass

class WorkerSettings:
    """arq worker configuration: registered jobs, lifecycle hooks and Redis."""

    # Jobs the worker can execute. The earlier `functions = [ping]` assignment
    # was dead code, overwritten immediately by this one.
    functions = [ping, extraction_task]
    on_startup = startup
    on_shutdown = shutdown
    redis_settings = RedisSettings.from_dsn(settings.REDIS_URL)
Loading