""" Flask API routes aligned with TitanForge/schema.sql. Includes auth, upload+transcription, history, and RAG search workflow. """ import hashlib import io import json import os import uuid import re import zipfile from pathlib import Path from typing import Any, List from dotenv import load_dotenv from faster_whisper import WhisperModel from flask import Blueprint, jsonify, request, send_file from werkzeug.security import check_password_hash, generate_password_hash from werkzeug.utils import secure_filename from db_queries import ( add_archive_file, add_audit_log, add_rag_chunks, create_audio_post, create_user, download_storage_object_by_stored_path, get_archive_file_by_role, get_archive_metadata, get_original_audio_url, get_archive_rights, get_audio_post_by_id, get_post_bundle, get_user_by_email, get_user_by_id, list_archive_files, list_audio_posts, list_audit_logs, list_rag_chunks, list_user_history, search_rag_chunks, search_rag_chunks_vector, update_audio_post, upload_storage_object, upsert_archive_metadata, upsert_archive_rights, ) load_dotenv() api = Blueprint("api", __name__, url_prefix="/api") UPLOAD_DIR = Path(os.getenv("BACKEND_UPLOAD_DIR", "uploads")) UPLOAD_DIR.mkdir(parents=True, exist_ok=True) ALLOWED_MEDIA_EXTENSIONS = {"mp4", "mov", "mkv", "webm", "m4a", "mp3", "wav", "ogg", "flac"} WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base") WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "cpu") WHISPER_COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "int8") ARCHIVE_BUCKET = os.getenv("SUPABASE_BUCKET", os.getenv("SUPABASE_ARCHIVE_BUCKET", "archives")) _whisper_model: WhisperModel | None = None def _model() -> WhisperModel: global _whisper_model if _whisper_model is None: _whisper_model = WhisperModel( WHISPER_MODEL, device=WHISPER_DEVICE, compute_type=WHISPER_COMPUTE_TYPE, ) return _whisper_model def _error(message: str, status: int = 400): return jsonify({"error": message}), status def _allowed_file(filename: str) -> bool: if "." not in filename: return False return filename.rsplit(".", 1)[1].lower() in ALLOWED_MEDIA_EXTENSIONS def _sha256(path: Path) -> str: h = hashlib.sha256() with path.open("rb") as f: while True: chunk = f.read(1024 * 1024) if not chunk: break h.update(chunk) return h.hexdigest() def _build_prompt(transcript_text: str, title: str) -> str: return ( "You are an archive assistant. Use the following transcribed audio as source context. " f"Post title: {title}.\n\n" "Transcript:\n" f"{transcript_text}\n\n" "Answer user questions grounded in this transcript." ) def _add_audio_url(post: dict[str, Any]) -> dict[str, Any]: """Add signed audio URL to post if ready""" if post.get("status") == "ready": try: audio_data = get_original_audio_url(post["post_id"], expires_in=3600) post["audio_url"] = audio_data["signed_url"] except: pass return post def _local_embedding(text: str, dimensions: int = 1536) -> List[float]: """ Free deterministic embedding fallback (offline). Replace with model-based embeddings later if needed. """ vector = [0.0] * dimensions tokens = re.findall(r"[A-Za-z0-9']+", text.lower()) if not tokens: return vector for token in tokens: digest = hashlib.sha256(token.encode("utf-8")).digest() idx = int.from_bytes(digest[:4], "big") % dimensions sign = 1.0 if (digest[4] & 1) == 0 else -1.0 weight = 1.0 + (digest[5] / 255.0) * 0.25 vector[idx] += sign * weight norm = sum(v * v for v in vector) ** 0.5 if norm > 0: vector = [v / norm for v in vector] return vector @api.get("/health") def health(): return jsonify({ "status": "ok", "whisper_model": WHISPER_MODEL, "whisper_device": WHISPER_DEVICE, "whisper_compute_type": WHISPER_COMPUTE_TYPE, }) # ==================== Auth ==================== @api.post("/auth/register") def api_register(): payload = request.get_json(force=True, silent=False) or {} email = (payload.get("email") or "").strip().lower() password = payload.get("password") or "" if not email or not password: return _error("'email' and 'password' are required.", 400) existing = get_user_by_email(email) if existing: return _error("User already exists for this email.", 409) try: user = create_user( { "email": email, "password_hash": generate_password_hash(password), "display_name": payload.get("display_name"), "avatar_url": payload.get("avatar_url"), "bio": payload.get("bio"), } ) add_audit_log({"user_id": user["user_id"], "action": "user.register", "details": json.dumps({"email": email})}) return jsonify({ "user": { "user_id": user["user_id"], "email": user["email"], "display_name": user.get("display_name"), } }), 201 except Exception as e: return _error(str(e), 500) @api.post("/auth/login") def api_login(): payload = request.get_json(force=True, silent=False) or {} email = (payload.get("email") or "").strip().lower() password = payload.get("password") or "" if not email or not password: return _error("'email' and 'password' are required.", 400) user = get_user_by_email(email) if not user: return _error("Invalid credentials.", 401) if not check_password_hash(user["password_hash"], password): return _error("Invalid credentials.", 401) add_audit_log({"user_id": user["user_id"], "action": "user.login", "details": json.dumps({"email": email})}) return jsonify( { "user": { "user_id": user["user_id"], "email": user["email"], "display_name": user.get("display_name"), "avatar_url": user.get("avatar_url"), "bio": user.get("bio"), } } ) # ==================== Upload + Prompt ==================== @api.post("/posts/upload") def api_upload_post(): if "file" not in request.files: return _error("Missing 'file' in form-data.", 400) media = request.files["file"] if not media.filename: return _error("Filename is empty.", 400) if not _allowed_file(media.filename): return _error("Unsupported media extension.", 400) user_id_raw = request.form.get("user_id") title = (request.form.get("title") or "Untitled recording").strip() description = request.form.get("description") visibility = (request.form.get("visibility") or "private").strip().lower() language = (request.form.get("language") or "en").strip().lower() if visibility not in {"private", "public"}: return _error("'visibility' must be 'private' or 'public'.", 400) try: user_id = int(user_id_raw) except (TypeError, ValueError): return _error("'user_id' is required and must be an integer.", 400) user = get_user_by_id(user_id) if not user: return _error("User not found.", 404) post_uuid = str(uuid.uuid4()) safe_name = secure_filename(media.filename) storage_prefix = f"archive/{user_id}/{post_uuid}" storage_object_path = f"{user_id}/{post_uuid}/original/{safe_name}" saved_path = UPLOAD_DIR / f"{post_uuid}_{safe_name}" media.save(saved_path) created_post = None try: created_post = create_audio_post( { "user_id": user_id, "title": title, "description": description, "visibility": visibility, "status": "processing", "language": language, "storage_prefix": storage_prefix, } ) post_id = int(created_post["post_id"]) media_sha = _sha256(saved_path) with saved_path.open("rb") as media_file: upload_storage_object( bucket=ARCHIVE_BUCKET, object_path=storage_object_path, content=media_file.read(), content_type=media.mimetype or "application/octet-stream", upsert=False, ) add_archive_file( post_id, { "role": "original_audio", "path": f"{ARCHIVE_BUCKET}/{storage_object_path}", "content_type": media.mimetype, "size_bytes": saved_path.stat().st_size, "sha256": media_sha, }, ) segments, _info = _model().transcribe(str(saved_path)) rag_rows = [] transcript_parts = [] for seg in segments: segment_text = seg.text.strip() if not segment_text: continue transcript_parts.append(segment_text) rag_rows.append( { "start_sec": float(seg.start), "end_sec": float(seg.end), "text": segment_text, "confidence": float(seg.avg_logprob) if seg.avg_logprob is not None else None, "embedding": _local_embedding(segment_text), } ) transcript_text = " ".join(transcript_parts).strip() prompt_text = _build_prompt(transcript_text, title) if rag_rows: add_rag_chunks(post_id, rag_rows) upsert_archive_metadata( post_id, json.dumps( { "prompt": prompt_text, "transcript_length_chars": len(transcript_text), "source_file": safe_name, "language": language, } ), ) add_archive_file( post_id, { "role": "transcript_txt", "path": f"{storage_prefix}/transcript.txt", "content_type": "text/plain", "size_bytes": len(transcript_text.encode("utf-8")), "sha256": hashlib.sha256(transcript_text.encode("utf-8")).hexdigest(), }, ) update_audio_post(post_id, {"status": "ready"}) add_audit_log( { "post_id": post_id, "user_id": user_id, "action": "post.upload.transcribed", "details": json.dumps({"visibility": visibility, "storage_prefix": storage_prefix}), } ) return jsonify( { "post_id": post_id, "visibility": visibility, "status": "ready", "audio_path": f"{ARCHIVE_BUCKET}/{storage_object_path}", "transcript_text": transcript_text, "prompt": prompt_text, "rag_chunk_count": len(rag_rows), } ), 201 except Exception as e: if created_post and created_post.get("post_id"): update_audio_post(int(created_post["post_id"]), {"status": "failed"}) add_audit_log( { "post_id": int(created_post["post_id"]), "user_id": user_id, "action": "post.upload.failed", "details": json.dumps({"error": str(e)}), } ) return _error(f"Upload/transcription failed: {e}", 500) # ==================== History + RAG Search ==================== @api.get("/users//history") def api_user_history(user_id: int): page = request.args.get("page", default=1, type=int) limit = request.args.get("limit", default=20, type=int) try: posts = list_user_history(user_id, page=page, limit=limit) return jsonify({"history": posts, "page": page, "limit": min(max(1, limit), 100)}) except Exception as e: return _error(str(e), 500) @api.get("/rag/search") def api_rag_search(): query_text = (request.args.get("q") or "").strip() user_id = request.args.get("user_id", type=int) query_embedding_raw = request.args.get("query_embedding") page = request.args.get("page", default=1, type=int) limit = request.args.get("limit", default=30, type=int) if not user_id: return _error("'user_id' is required.", 400) try: if query_embedding_raw: try: parsed = json.loads(query_embedding_raw) if not isinstance(parsed, list): return _error("'query_embedding' must be a JSON array.", 400) query_embedding = [float(v) for v in parsed] except Exception: return _error("Invalid 'query_embedding'. Example: [0.1,0.2,...]", 400) rows = search_rag_chunks_vector(user_id=user_id, query_embedding=query_embedding, limit=limit) return jsonify({"results": rows, "mode": "vector", "limit": min(max(1, limit), 100)}) if not query_text: return _error("'q' is required when 'query_embedding' is not provided.", 400) rows = search_rag_chunks(user_id=user_id, query_text=query_text, page=page, limit=limit) return jsonify({"results": rows, "mode": "text", "page": page, "limit": min(max(1, limit), 100)}) except Exception as e: return _error(str(e), 500) # ==================== Existing CRUD Routes ==================== @api.post("/users") def api_create_user(): payload = request.get_json(force=True, silent=False) or {} try: return jsonify(create_user(payload)), 201 except ValueError as e: return _error(str(e), 400) except Exception as e: return _error(str(e), 500) @api.get("/users/") def api_get_user(user_id: int): user = get_user_by_id(user_id) if not user: return _error("User not found.", 404) return jsonify(user) @api.post("/posts") def api_create_post(): payload = request.get_json(force=True, silent=False) or {} try: return jsonify(create_audio_post(payload)), 201 except ValueError as e: return _error(str(e), 400) except Exception as e: return _error(str(e), 500) @api.get("/posts") def api_list_posts(): page = request.args.get("page", default=1, type=int) limit = request.args.get("limit", default=20, type=int) visibility = request.args.get("visibility") current_user_id = request.args.get("current_user_id", type=int) # NEW LINE try: rows = list_audio_posts(page=page, limit=limit, visibility=visibility) # NEW: Filter private posts if current_user_id: rows = [p for p in rows if p.get('visibility') == 'public' or p.get('user_id') == current_user_id] else: rows = [p for p in rows if p.get('visibility') == 'public'] # NEW: Add audio URLs - CHANGE THIS LINE ONLY rows = [_add_audio_url(post) for post in rows] return jsonify({"posts": rows, "page": page, "limit": min(max(1, limit), 100)}) except Exception as e: return _error(str(e), 500) @api.get("/posts/") def api_get_post(post_id: int): row = get_audio_post_by_id(post_id) if not row: return _error("Post not found.", 404) return jsonify(row) @api.patch("/posts/") def api_patch_post(post_id: int): payload = request.get_json(force=True, silent=False) or {} try: row = update_audio_post(post_id, payload) if not row: return _error("Post not found.", 404) return jsonify(row) except Exception as e: return _error(str(e), 500) @api.get("/posts//bundle") def api_post_bundle(post_id: int): bundle = get_post_bundle(post_id) if not bundle: return _error("Post not found.", 404) return jsonify(bundle) @api.get("/posts//audio-url") def api_post_audio_url(post_id: int): """ Get signed URL for original audio/video so users can play it. Private posts require owner user_id in query params. """ row = get_audio_post_by_id(post_id) if not row: return _error("Post not found.", 404) visibility = row.get("visibility") owner_id = row.get("user_id") requester_id = request.args.get("user_id", type=int) expires_in = request.args.get("expires_in", default=3600, type=int) expires_in = min(max(60, expires_in), 86400) if visibility == "private" and requester_id != owner_id: return _error("Not authorized to access this private audio.", 403) try: result = get_original_audio_url(post_id=post_id, expires_in=expires_in) return jsonify(result) except ValueError as e: return _error(str(e), 404) except Exception as e: return _error(str(e), 500) @api.get("/posts//archive.zip") def api_post_archive_zip(post_id: int): """ Download a complete archive zip for a post. Private posts require owner user_id in query params. """ row = get_audio_post_by_id(post_id) if not row: return _error("Post not found.", 404) visibility = row.get("visibility") owner_id = row.get("user_id") requester_id = request.args.get("user_id", type=int) if visibility == "private" and requester_id != owner_id: return _error("Not authorized to download this private archive.", 403) try: bundle = get_post_bundle(post_id) if not bundle: return _error("Post bundle not found.", 404) archive_buf = io.BytesIO() with zipfile.ZipFile(archive_buf, mode="w", compression=zipfile.ZIP_DEFLATED) as zf: # Always include core JSON artifacts. zf.writestr("post.json", json.dumps(bundle.get("post", {}), indent=2, default=str)) zf.writestr("metadata.json", json.dumps(bundle.get("metadata", {}), indent=2, default=str)) zf.writestr("rights.json", json.dumps(bundle.get("rights", {}), indent=2, default=str)) zf.writestr("rag_chunks.json", json.dumps(bundle.get("rag_chunks", []), indent=2, default=str)) zf.writestr("audit_log.json", json.dumps(bundle.get("audit_log", []), indent=2, default=str)) # Derive transcript from rag chunks. chunks = bundle.get("rag_chunks", []) or [] transcript_text = " ".join( (chunk.get("text") or "").strip() for chunk in chunks if chunk.get("text") ).strip() if transcript_text: zf.writestr("transcript.txt", transcript_text) # Include original media from Supabase Storage if present. original_file = get_archive_file_by_role(post_id, "original_audio") if original_file and original_file.get("path"): original_bytes = download_storage_object_by_stored_path(original_file["path"]) source_name = original_file["path"].split("/")[-1] or f"post_{post_id}_original.bin" zf.writestr(f"original/{source_name}", original_bytes) archive_buf.seek(0) download_name = f"voicevault_post_{post_id}_archive.zip" return send_file( archive_buf, mimetype="application/zip", as_attachment=True, download_name=download_name, ) except Exception as e: return _error(f"Failed to build archive zip: {e}", 500) @api.post("/posts//files") def api_add_file(post_id: int): payload = request.get_json(force=True, silent=False) or {} try: return jsonify(add_archive_file(post_id, payload)), 201 except ValueError as e: return _error(str(e), 400) except Exception as e: return _error(str(e), 500) @api.get("/posts//files") def api_list_files(post_id: int): try: return jsonify({"files": list_archive_files(post_id)}) except Exception as e: return _error(str(e), 500) @api.put("/posts//metadata") def api_put_metadata(post_id: int): payload = request.get_json(force=True, silent=False) or {} metadata = payload.get("metadata") if metadata is None: return _error("'metadata' is required.", 400) try: return jsonify(upsert_archive_metadata(post_id, metadata)) except Exception as e: return _error(str(e), 500) @api.get("/posts//metadata") def api_get_metadata(post_id: int): row = get_archive_metadata(post_id) if not row: return _error("Metadata not found.", 404) return jsonify(row) @api.put("/posts//rights") def api_put_rights(post_id: int): payload = request.get_json(force=True, silent=False) or {} try: return jsonify(upsert_archive_rights(post_id, payload)) except Exception as e: return _error(str(e), 500) @api.get("/posts//rights") def api_get_rights(post_id: int): row = get_archive_rights(post_id) if not row: return _error("Rights not found.", 404) return jsonify(row) @api.post("/posts//chunks") def api_add_chunks(post_id: int): payload = request.get_json(force=True, silent=False) or {} chunks = payload.get("chunks") if not isinstance(chunks, list): return _error("'chunks' must be a list.", 400) try: rows = add_rag_chunks(post_id, chunks) return jsonify({"inserted": len(rows), "chunks": rows}), 201 except Exception as e: return _error(str(e), 500) @api.get("/posts//chunks") def api_get_chunks(post_id: int): page = request.args.get("page", default=1, type=int) limit = request.args.get("limit", default=200, type=int) try: return jsonify({"chunks": list_rag_chunks(post_id, page=page, limit=limit)}) except Exception as e: return _error(str(e), 500) @api.post("/audit") def api_create_audit(): payload = request.get_json(force=True, silent=False) or {} try: return jsonify(add_audit_log(payload)), 201 except ValueError as e: return _error(str(e), 400) except Exception as e: return _error(str(e), 500) @api.get("/audit") def api_list_audit(): post_id = request.args.get("post_id", type=int) user_id = request.args.get("user_id", type=int) page = request.args.get("page", default=1, type=int) limit = request.args.get("limit", default=100, type=int) try: return jsonify({"logs": list_audit_logs(post_id=post_id, user_id=user_id, page=page, limit=limit)}) except Exception as e: return _error(str(e), 500) @api.get("/posts//audit") def api_post_audit(post_id: int): page = request.args.get("page", default=1, type=int) limit = request.args.get("limit", default=100, type=int) try: return jsonify({"logs": list_audit_logs(post_id=post_id, page=page, limit=limit)}) except Exception as e: return _error(str(e), 500)