diff --git a/backend/__pycache__/api_routes.cpython-311.pyc b/backend/__pycache__/api_routes.cpython-311.pyc index 7fd670c..c6c5b4f 100644 Binary files a/backend/__pycache__/api_routes.cpython-311.pyc and b/backend/__pycache__/api_routes.cpython-311.pyc differ diff --git a/backend/__pycache__/db_queries.cpython-311.pyc b/backend/__pycache__/db_queries.cpython-311.pyc index 12f2702..690ec4a 100644 Binary files a/backend/__pycache__/db_queries.cpython-311.pyc and b/backend/__pycache__/db_queries.cpython-311.pyc differ diff --git a/backend/api_routes.py b/backend/api_routes.py index ad8ad67..5a75fd7 100644 --- a/backend/api_routes.py +++ b/backend/api_routes.py @@ -1,8 +1,19 @@ """ Flask API routes aligned with TitanForge/schema.sql. +Includes auth, upload+transcription, history, and RAG search workflow. """ +import hashlib +import json +import os +import uuid +from pathlib import Path + +from dotenv import load_dotenv +from faster_whisper import WhisperModel from flask import Blueprint, jsonify, request +from werkzeug.security import check_password_hash, generate_password_hash +from werkzeug.utils import secure_filename from db_queries import ( add_archive_file, @@ -14,29 +25,335 @@ from db_queries import ( get_archive_rights, get_audio_post_by_id, get_post_bundle, + get_user_by_email, get_user_by_id, list_archive_files, list_audio_posts, list_audit_logs, list_rag_chunks, + list_user_history, + search_rag_chunks, update_audio_post, upsert_archive_metadata, upsert_archive_rights, ) +load_dotenv() + api = Blueprint("api", __name__, url_prefix="/api") +UPLOAD_DIR = Path(os.getenv("BACKEND_UPLOAD_DIR", "uploads")) +UPLOAD_DIR.mkdir(parents=True, exist_ok=True) + +ALLOWED_MEDIA_EXTENSIONS = {"mp4", "mov", "mkv", "webm", "m4a", "mp3", "wav", "ogg", "flac"} +WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base") +WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "cpu") +WHISPER_COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "int8") + +_whisper_model: WhisperModel | None = None + + +def _model() -> WhisperModel: + global _whisper_model + if _whisper_model is None: + _whisper_model = WhisperModel( + WHISPER_MODEL, + device=WHISPER_DEVICE, + compute_type=WHISPER_COMPUTE_TYPE, + ) + return _whisper_model + def _error(message: str, status: int = 400): return jsonify({"error": message}), status +def _allowed_file(filename: str) -> bool: + if "." not in filename: + return False + return filename.rsplit(".", 1)[1].lower() in ALLOWED_MEDIA_EXTENSIONS + + +def _sha256(path: Path) -> str: + h = hashlib.sha256() + with path.open("rb") as f: + while True: + chunk = f.read(1024 * 1024) + if not chunk: + break + h.update(chunk) + return h.hexdigest() + + +def _build_prompt(transcript_text: str, title: str) -> str: + return ( + "You are an archive assistant. Use the following transcribed audio as source context. " + f"Post title: {title}.\n\n" + "Transcript:\n" + f"{transcript_text}\n\n" + "Answer user questions grounded in this transcript." + ) + + @api.get("/health") def health(): - return jsonify({"status": "ok"}) + return jsonify({ + "status": "ok", + "whisper_model": WHISPER_MODEL, + "whisper_device": WHISPER_DEVICE, + "whisper_compute_type": WHISPER_COMPUTE_TYPE, + }) -# ==================== Users ==================== +# ==================== Auth ==================== + +@api.post("/auth/register") +def api_register(): + payload = request.get_json(force=True, silent=False) or {} + email = (payload.get("email") or "").strip().lower() + password = payload.get("password") or "" + + if not email or not password: + return _error("'email' and 'password' are required.", 400) + + existing = get_user_by_email(email) + if existing: + return _error("User already exists for this email.", 409) + + try: + user = create_user( + { + "email": email, + "password_hash": generate_password_hash(password), + "display_name": payload.get("display_name"), + "avatar_url": payload.get("avatar_url"), + "bio": payload.get("bio"), + } + ) + add_audit_log({"user_id": user["user_id"], "action": "user.register", "details": json.dumps({"email": email})}) + return jsonify({ + "user": { + "user_id": user["user_id"], + "email": user["email"], + "display_name": user.get("display_name"), + } + }), 201 + except Exception as e: + return _error(str(e), 500) + + +@api.post("/auth/login") +def api_login(): + payload = request.get_json(force=True, silent=False) or {} + email = (payload.get("email") or "").strip().lower() + password = payload.get("password") or "" + + if not email or not password: + return _error("'email' and 'password' are required.", 400) + + user = get_user_by_email(email) + if not user: + return _error("Invalid credentials.", 401) + + if not check_password_hash(user["password_hash"], password): + return _error("Invalid credentials.", 401) + + add_audit_log({"user_id": user["user_id"], "action": "user.login", "details": json.dumps({"email": email})}) + + return jsonify( + { + "user": { + "user_id": user["user_id"], + "email": user["email"], + "display_name": user.get("display_name"), + "avatar_url": user.get("avatar_url"), + "bio": user.get("bio"), + } + } + ) + + +# ==================== Upload + Prompt ==================== + +@api.post("/posts/upload") +def api_upload_post(): + if "file" not in request.files: + return _error("Missing 'file' in form-data.", 400) + + media = request.files["file"] + if not media.filename: + return _error("Filename is empty.", 400) + if not _allowed_file(media.filename): + return _error("Unsupported media extension.", 400) + + user_id_raw = request.form.get("user_id") + title = (request.form.get("title") or "Untitled recording").strip() + description = request.form.get("description") + visibility = (request.form.get("visibility") or "private").strip().lower() + language = (request.form.get("language") or "en").strip().lower() + + if visibility not in {"private", "public"}: + return _error("'visibility' must be 'private' or 'public'.", 400) + + try: + user_id = int(user_id_raw) + except (TypeError, ValueError): + return _error("'user_id' is required and must be an integer.", 400) + + user = get_user_by_id(user_id) + if not user: + return _error("User not found.", 404) + + post_uuid = str(uuid.uuid4()) + safe_name = secure_filename(media.filename) + storage_prefix = f"archive/{user_id}/{post_uuid}" + saved_path = UPLOAD_DIR / f"{post_uuid}_{safe_name}" + media.save(saved_path) + + created_post = None + try: + created_post = create_audio_post( + { + "user_id": user_id, + "title": title, + "description": description, + "visibility": visibility, + "status": "processing", + "language": language, + "storage_prefix": storage_prefix, + } + ) + + post_id = int(created_post["post_id"]) + media_sha = _sha256(saved_path) + + add_archive_file( + post_id, + { + "role": "original_audio", + "path": str(saved_path).replace("\\", "/"), + "content_type": media.mimetype, + "size_bytes": saved_path.stat().st_size, + "sha256": media_sha, + }, + ) + + segments, _info = _model().transcribe(str(saved_path)) + rag_rows = [] + transcript_parts = [] + for seg in segments: + segment_text = seg.text.strip() + if not segment_text: + continue + transcript_parts.append(segment_text) + rag_rows.append( + { + "start_sec": float(seg.start), + "end_sec": float(seg.end), + "text": segment_text, + "confidence": float(seg.avg_logprob) if seg.avg_logprob is not None else None, + "embedding": None, + } + ) + + transcript_text = " ".join(transcript_parts).strip() + prompt_text = _build_prompt(transcript_text, title) + + if rag_rows: + add_rag_chunks(post_id, rag_rows) + + upsert_archive_metadata( + post_id, + json.dumps( + { + "prompt": prompt_text, + "transcript_length_chars": len(transcript_text), + "source_file": safe_name, + "language": language, + } + ), + ) + + add_archive_file( + post_id, + { + "role": "transcript_txt", + "path": f"{storage_prefix}/transcript.txt", + "content_type": "text/plain", + "size_bytes": len(transcript_text.encode("utf-8")), + "sha256": hashlib.sha256(transcript_text.encode("utf-8")).hexdigest(), + }, + ) + + update_audio_post(post_id, {"status": "ready"}) + add_audit_log( + { + "post_id": post_id, + "user_id": user_id, + "action": "post.upload.transcribed", + "details": json.dumps({"visibility": visibility, "storage_prefix": storage_prefix}), + } + ) + + return jsonify( + { + "post_id": post_id, + "visibility": visibility, + "status": "ready", + "audio_path": str(saved_path).replace("\\", "/"), + "transcript_text": transcript_text, + "prompt": prompt_text, + "rag_chunk_count": len(rag_rows), + } + ), 201 + except Exception as e: + if created_post and created_post.get("post_id"): + update_audio_post(int(created_post["post_id"]), {"status": "failed"}) + add_audit_log( + { + "post_id": int(created_post["post_id"]), + "user_id": user_id, + "action": "post.upload.failed", + "details": json.dumps({"error": str(e)}), + } + ) + return _error(f"Upload/transcription failed: {e}", 500) + + +# ==================== History + RAG Search ==================== + +@api.get("/users//history") +def api_user_history(user_id: int): + page = request.args.get("page", default=1, type=int) + limit = request.args.get("limit", default=20, type=int) + + try: + posts = list_user_history(user_id, page=page, limit=limit) + return jsonify({"history": posts, "page": page, "limit": min(max(1, limit), 100)}) + except Exception as e: + return _error(str(e), 500) + + +@api.get("/rag/search") +def api_rag_search(): + query_text = (request.args.get("q") or "").strip() + user_id = request.args.get("user_id", type=int) + page = request.args.get("page", default=1, type=int) + limit = request.args.get("limit", default=30, type=int) + + if not user_id: + return _error("'user_id' is required.", 400) + if not query_text: + return _error("'q' is required.", 400) + + try: + rows = search_rag_chunks(user_id=user_id, query_text=query_text, page=page, limit=limit) + return jsonify({"results": rows, "page": page, "limit": min(max(1, limit), 100)}) + except Exception as e: + return _error(str(e), 500) + + +# ==================== Existing CRUD Routes ==================== @api.post("/users") def api_create_user(): @@ -57,8 +374,6 @@ def api_get_user(user_id: int): return jsonify(user) -# ==================== Audio Posts ==================== - @api.post("/posts") def api_create_post(): payload = request.get_json(force=True, silent=False) or {} @@ -112,8 +427,6 @@ def api_post_bundle(post_id: int): return jsonify(bundle) -# ==================== Archive Files ==================== - @api.post("/posts//files") def api_add_file(post_id: int): payload = request.get_json(force=True, silent=False) or {} @@ -133,8 +446,6 @@ def api_list_files(post_id: int): return _error(str(e), 500) -# ==================== Metadata ==================== - @api.put("/posts//metadata") def api_put_metadata(post_id: int): payload = request.get_json(force=True, silent=False) or {} @@ -156,8 +467,6 @@ def api_get_metadata(post_id: int): return jsonify(row) -# ==================== Rights ==================== - @api.put("/posts//rights") def api_put_rights(post_id: int): payload = request.get_json(force=True, silent=False) or {} @@ -175,8 +484,6 @@ def api_get_rights(post_id: int): return jsonify(row) -# ==================== RAG Chunks ==================== - @api.post("/posts//chunks") def api_add_chunks(post_id: int): payload = request.get_json(force=True, silent=False) or {} @@ -203,8 +510,6 @@ def api_get_chunks(post_id: int): return _error(str(e), 500) -# ==================== Audit Log ==================== - @api.post("/audit") def api_create_audit(): payload = request.get_json(force=True, silent=False) or {} diff --git a/backend/db_queries.py b/backend/db_queries.py index f1138c5..204db1c 100644 --- a/backend/db_queries.py +++ b/backend/db_queries.py @@ -63,6 +63,10 @@ def get_user_by_id(user_id: int) -> Optional[Dict[str, Any]]: return _first(supabase.table("users").select("*").eq("user_id", user_id).limit(1).execute()) +def get_user_by_email(email: str) -> Optional[Dict[str, Any]]: + return _first(supabase.table("users").select("*").eq("email", email).limit(1).execute()) + + # ==================== Audio Posts ==================== def create_audio_post(payload: Dict[str, Any]) -> Dict[str, Any]: @@ -102,6 +106,10 @@ def get_audio_post_by_id(post_id: int) -> Optional[Dict[str, Any]]: return _first(query.execute()) +def list_user_history(user_id: int, page: int = 1, limit: int = 20) -> List[Dict[str, Any]]: + return list_audio_posts(page=page, limit=limit, user_id=user_id) + + def list_audio_posts(page: int = 1, limit: int = 20, visibility: Optional[str] = None, user_id: Optional[int] = None) -> List[Dict[str, Any]]: start, end = _paginate(page, limit) query = supabase.table("audio_posts").select("*, users(user_id, email, display_name, avatar_url)") @@ -267,6 +275,23 @@ def list_rag_chunks(post_id: int, page: int = 1, limit: int = 200) -> List[Dict[ return _rows(response) +def search_rag_chunks(user_id: int, query_text: str, page: int = 1, limit: int = 30) -> List[Dict[str, Any]]: + start, end = _paginate(page, limit) + response = ( + supabase.table("rag_chunks") + .select( + "chunk_id, post_id, start_sec, end_sec, text, confidence, created_at, " + "audio_posts!inner(post_id, user_id, title, visibility, created_at)" + ) + .eq("audio_posts.user_id", user_id) + .ilike("text", f"%{query_text}%") + .order("created_at", desc=True) + .range(start, end) + .execute() + ) + return _rows(response) + + # ==================== Audit Log ==================== def add_audit_log(payload: Dict[str, Any]) -> Dict[str, Any]: diff --git a/backend/uploads/5bd199e8-fa0a-47e3-aca8-0a4e732c0610_data.m4a b/backend/uploads/5bd199e8-fa0a-47e3-aca8-0a4e732c0610_data.m4a new file mode 100644 index 0000000..38e5740 Binary files /dev/null and b/backend/uploads/5bd199e8-fa0a-47e3-aca8-0a4e732c0610_data.m4a differ