rag applications
This commit is contained in:
Binary file not shown.
Binary file not shown.
@@ -1,8 +1,19 @@
|
||||
"""
|
||||
Flask API routes aligned with TitanForge/schema.sql.
|
||||
Includes auth, upload+transcription, history, and RAG search workflow.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from faster_whisper import WhisperModel
|
||||
from flask import Blueprint, jsonify, request
|
||||
from werkzeug.security import check_password_hash, generate_password_hash
|
||||
from werkzeug.utils import secure_filename
|
||||
|
||||
from db_queries import (
|
||||
add_archive_file,
|
||||
@@ -14,29 +25,335 @@ from db_queries import (
|
||||
get_archive_rights,
|
||||
get_audio_post_by_id,
|
||||
get_post_bundle,
|
||||
get_user_by_email,
|
||||
get_user_by_id,
|
||||
list_archive_files,
|
||||
list_audio_posts,
|
||||
list_audit_logs,
|
||||
list_rag_chunks,
|
||||
list_user_history,
|
||||
search_rag_chunks,
|
||||
update_audio_post,
|
||||
upsert_archive_metadata,
|
||||
upsert_archive_rights,
|
||||
)
|
||||
|
||||
# Read a local .env file (python-dotenv) before any os.getenv lookups below.
load_dotenv()

# All routes in this module are mounted under /api.
api = Blueprint("api", __name__, url_prefix="/api")

# Directory that receives uploaded media; created at import time if missing.
UPLOAD_DIR = Path(os.getenv("BACKEND_UPLOAD_DIR", "uploads"))
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

# Lower-case file extensions accepted by the upload endpoint.
ALLOWED_MEDIA_EXTENSIONS = {
    "mp4",
    "mov",
    "mkv",
    "webm",
    "m4a",
    "mp3",
    "wav",
    "ogg",
    "flac",
}

# Whisper settings, each overridable through the environment.
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")
WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "cpu")
WHISPER_COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "int8")

# Process-wide model instance; stays None until _model() first builds it.
_whisper_model: WhisperModel | None = None
|
||||
|
||||
|
||||
def _model() -> WhisperModel:
    """Return the shared Whisper model, constructing it on the first call.

    The instance is cached in the module-level ``_whisper_model`` so later
    calls reuse it instead of building a new model.
    """
    global _whisper_model
    if _whisper_model is not None:
        return _whisper_model

    model_options = {
        "device": WHISPER_DEVICE,
        "compute_type": WHISPER_COMPUTE_TYPE,
    }
    _whisper_model = WhisperModel(WHISPER_MODEL, **model_options)
    return _whisper_model
|
||||
|
||||
|
||||
def _error(message: str, status: int = 400):
    """Build a ``(response, status)`` pair carrying a JSON error body.

    Args:
        message: Text placed under the ``"error"`` key.
        status: HTTP status code to return (400 unless overridden).
    """
    body = {"error": message}
    return jsonify(body), status
|
||||
|
||||
|
||||
def _allowed_file(filename: str) -> bool:
    """Tell whether *filename* ends in one of ``ALLOWED_MEDIA_EXTENSIONS``.

    Only the text after the last dot is considered, compared
    case-insensitively. A name without any dot is rejected.
    """
    _stem, dot, extension = filename.rpartition(".")
    # rpartition yields an empty separator when the name contains no dot.
    return bool(dot) and extension.lower() in ALLOWED_MEDIA_EXTENSIONS
|
||||
|
||||
|
||||
def _sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is consumed in 1 MiB pieces so it is never held in memory whole.
    """
    digest = hashlib.sha256()
    piece_size = 1024 * 1024
    with path.open("rb") as stream:
        # iter() with a sentinel stops as soon as read() returns b"" at EOF.
        for piece in iter(lambda: stream.read(piece_size), b""):
            digest.update(piece)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def _build_prompt(transcript_text: str, title: str) -> str:
    """Assemble the archive-assistant prompt for one transcribed post.

    Args:
        transcript_text: Full transcript inserted under the "Transcript:" header.
        title: Post title inserted into the introductory sentence.

    Returns:
        The prompt as a single string.
    """
    sections = (
        "You are an archive assistant. Use the following transcribed audio as source context. ",
        f"Post title: {title}.\n\n",
        "Transcript:\n",
        f"{transcript_text}\n\n",
        "Answer user questions grounded in this transcript.",
    )
    return "".join(sections)
|
||||
|
||||
|
||||
@api.get("/health")
def health():
    """Report liveness together with the configured Whisper settings.

    The response always contains ``"status": "ok"``; the remaining keys echo
    the module-level Whisper configuration constants.
    """
    # A bare {"status": "ok"} return used to precede this one, which made the
    # detailed payload unreachable. The payload below is a superset of it.
    return jsonify(
        {
            "status": "ok",
            "whisper_model": WHISPER_MODEL,
            "whisper_device": WHISPER_DEVICE,
            "whisper_compute_type": WHISPER_COMPUTE_TYPE,
        }
    )
|
||||
|
||||
|
||||
# ==================== Users ====================
|
||||
# ==================== Auth ====================
|
||||
|
||||
@api.post("/auth/register")
def api_register():
    """Create a user account from a JSON body and record an audit entry.

    Expected JSON keys: ``email`` and ``password`` (required strings), plus
    optional ``display_name``, ``avatar_url`` and ``bio``.

    Returns:
        201 with the new user's ``user_id``, ``email`` and ``display_name``;
        400 when the body is not a JSON object or the credentials are
        missing or not strings; 409 when the email is already registered;
        500 when user creation or audit logging raises.
    """
    payload = request.get_json(force=True, silent=False) or {}
    # A JSON array/string/number has no .get(); reject it instead of crashing.
    if not isinstance(payload, dict):
        return _error("Request body must be a JSON object.", 400)

    email = payload.get("email") or ""
    password = payload.get("password") or ""
    if not isinstance(email, str) or not isinstance(password, str):
        return _error("'email' and 'password' must be strings.", 400)

    # Emails are stored and looked up trimmed and lower-cased.
    email = email.strip().lower()
    if not email or not password:
        return _error("'email' and 'password' are required.", 400)

    existing = get_user_by_email(email)
    if existing:
        return _error("User already exists for this email.", 409)

    try:
        user = create_user(
            {
                "email": email,
                "password_hash": generate_password_hash(password),
                "display_name": payload.get("display_name"),
                "avatar_url": payload.get("avatar_url"),
                "bio": payload.get("bio"),
            }
        )
        add_audit_log(
            {
                "user_id": user["user_id"],
                "action": "user.register",
                "details": json.dumps({"email": email}),
            }
        )
        return jsonify(
            {
                "user": {
                    "user_id": user["user_id"],
                    "email": user["email"],
                    "display_name": user.get("display_name"),
                }
            }
        ), 201
    except Exception as e:
        return _error(str(e), 500)
|
||||
|
||||
|
||||
@api.post("/auth/login")
def api_login():
    """Check an email/password pair and return the matching user profile.

    Expected JSON keys: ``email`` and ``password`` (required strings).

    Returns:
        200 with the user's public profile fields; 400 when the body is not
        a JSON object or the credentials are missing or not strings; 401 when
        the email is unknown or the password does not match.
    """
    payload = request.get_json(force=True, silent=False) or {}
    # A JSON array/string/number has no .get(); reject it instead of crashing.
    if not isinstance(payload, dict):
        return _error("Request body must be a JSON object.", 400)

    email = payload.get("email") or ""
    password = payload.get("password") or ""
    if not isinstance(email, str) or not isinstance(password, str):
        return _error("'email' and 'password' must be strings.", 400)

    email = email.strip().lower()
    if not email or not password:
        return _error("'email' and 'password' are required.", 400)

    user = get_user_by_email(email)
    # Unknown email and wrong password share one message so the response does
    # not reveal which of the two failed.
    if not user or not check_password_hash(user["password_hash"], password):
        return _error("Invalid credentials.", 401)

    add_audit_log(
        {
            "user_id": user["user_id"],
            "action": "user.login",
            "details": json.dumps({"email": email}),
        }
    )

    return jsonify(
        {
            "user": {
                "user_id": user["user_id"],
                "email": user["email"],
                "display_name": user.get("display_name"),
                "avatar_url": user.get("avatar_url"),
                "bio": user.get("bio"),
            }
        }
    )
|
||||
|
||||
|
||||
# ==================== Upload + Prompt ====================
|
||||
|
||||
def _segments_to_rag_rows(segments):
    """Turn transcription segments into RAG chunk rows, skipping blank text."""
    rows = []
    for seg in segments:
        segment_text = seg.text.strip()
        if not segment_text:
            continue
        rows.append(
            {
                "start_sec": float(seg.start),
                "end_sec": float(seg.end),
                "text": segment_text,
                # NOTE(review): avg_logprob is stored as "confidence" unchanged.
                "confidence": float(seg.avg_logprob) if seg.avg_logprob is not None else None,
                "embedding": None,
            }
        )
    return rows


@api.post("/posts/upload")
def api_upload_post():
    """Store an uploaded media file, transcribe it, and index the transcript.

    Multipart form fields:
        file: Media file with an allowed extension (required).
        user_id: Integer id of an existing user (required).
        title: Post title; defaults to "Untitled recording".
        description: Optional free text.
        visibility: "private" (default) or "public".
        language: Language tag stored with the post; defaults to "en".

    Returns:
        201 with the post id, transcript, prompt and chunk count on success;
        400 for invalid form input; 404 when the user does not exist;
        500 when saving, transcription or persistence raises.
    """
    import logging

    if "file" not in request.files:
        return _error("Missing 'file' in form-data.", 400)

    media = request.files["file"]
    if not media.filename:
        return _error("Filename is empty.", 400)
    if not _allowed_file(media.filename):
        return _error("Unsupported media extension.", 400)

    user_id_raw = request.form.get("user_id")
    # Strip first, then fall back, so a whitespace-only title is not stored as "".
    title = (request.form.get("title") or "").strip() or "Untitled recording"
    description = request.form.get("description")
    visibility = (request.form.get("visibility") or "private").strip().lower()
    language = (request.form.get("language") or "en").strip().lower()

    if visibility not in {"private", "public"}:
        return _error("'visibility' must be 'private' or 'public'.", 400)

    try:
        user_id = int(user_id_raw)
    except (TypeError, ValueError):
        return _error("'user_id' is required and must be an integer.", 400)

    user = get_user_by_id(user_id)
    if not user:
        return _error("User not found.", 404)

    post_uuid = str(uuid.uuid4())
    safe_name = secure_filename(media.filename)
    storage_prefix = f"archive/{user_id}/{post_uuid}"
    # The uuid prefix keeps two uploads with the same filename from colliding.
    saved_path = UPLOAD_DIR / f"{post_uuid}_{safe_name}"
    audio_path = str(saved_path).replace("\\", "/")

    created_post = None
    try:
        # Saving is inside the try so a disk error yields the JSON 500 below.
        media.save(saved_path)

        created_post = create_audio_post(
            {
                "user_id": user_id,
                "title": title,
                "description": description,
                "visibility": visibility,
                "status": "processing",
                "language": language,
                "storage_prefix": storage_prefix,
            }
        )
        post_id = int(created_post["post_id"])

        add_archive_file(
            post_id,
            {
                "role": "original_audio",
                "path": audio_path,
                "content_type": media.mimetype,
                "size_bytes": saved_path.stat().st_size,
                "sha256": _sha256(saved_path),
            },
        )

        segments, _info = _model().transcribe(str(saved_path))
        rag_rows = _segments_to_rag_rows(segments)
        transcript_text = " ".join(row["text"] for row in rag_rows).strip()
        prompt_text = _build_prompt(transcript_text, title)

        if rag_rows:
            add_rag_chunks(post_id, rag_rows)

        upsert_archive_metadata(
            post_id,
            json.dumps(
                {
                    "prompt": prompt_text,
                    "transcript_length_chars": len(transcript_text),
                    "source_file": safe_name,
                    "language": language,
                }
            ),
        )

        transcript_bytes = transcript_text.encode("utf-8")
        add_archive_file(
            post_id,
            {
                "role": "transcript_txt",
                "path": f"{storage_prefix}/transcript.txt",
                "content_type": "text/plain",
                "size_bytes": len(transcript_bytes),
                "sha256": hashlib.sha256(transcript_bytes).hexdigest(),
            },
        )

        update_audio_post(post_id, {"status": "ready"})
        add_audit_log(
            {
                "post_id": post_id,
                "user_id": user_id,
                "action": "post.upload.transcribed",
                "details": json.dumps({"visibility": visibility, "storage_prefix": storage_prefix}),
            }
        )

        return jsonify(
            {
                "post_id": post_id,
                "visibility": visibility,
                "status": "ready",
                "audio_path": audio_path,
                "transcript_text": transcript_text,
                "prompt": prompt_text,
                "rag_chunk_count": len(rag_rows),
            }
        ), 201
    except Exception as e:
        # The bookkeeping below is best-effort: a second failure here must not
        # replace the original error reported to the client.
        try:
            if created_post and created_post.get("post_id"):
                failed_post_id = int(created_post["post_id"])
                update_audio_post(failed_post_id, {"status": "failed"})
                add_audit_log(
                    {
                        "post_id": failed_post_id,
                        "user_id": user_id,
                        "action": "post.upload.failed",
                        "details": json.dumps({"error": str(e)}),
                    }
                )
            else:
                # No post row was created, so nothing references the file;
                # delete it rather than leave an orphan in the upload dir.
                saved_path.unlink(missing_ok=True)
        except Exception:
            logging.getLogger(__name__).exception(
                "Failure bookkeeping for upload %s did not complete", post_uuid
            )
        return _error(f"Upload/transcription failed: {e}", 500)
|
||||
|
||||
|
||||
# ==================== History + RAG Search ====================
|
||||
|
||||
@api.get("/users/<int:user_id>/history")
def api_user_history(user_id: int):
    """List one user's posts, paginated by the ``page``/``limit`` query args.

    ``page`` defaults to 1 and ``limit`` to 20. The ``limit`` echoed in the
    response is clamped to 1..100; the raw value is what is passed on to
    ``list_user_history``. Any exception becomes a JSON 500.
    """
    args = request.args
    page = args.get("page", default=1, type=int)
    limit = args.get("limit", default=20, type=int)
    reported_limit = min(max(1, limit), 100)

    try:
        history_rows = list_user_history(user_id, page=page, limit=limit)
        body = {"history": history_rows, "page": page, "limit": reported_limit}
        return jsonify(body)
    except Exception as exc:
        return _error(str(exc), 500)
|
||||
|
||||
|
||||
@api.get("/rag/search")
def api_rag_search():
    """Search a user's transcript chunks for the text given in ``q``.

    Query args: ``q`` and ``user_id`` (both required), ``page`` (default 1)
    and ``limit`` (default 30). The ``limit`` echoed in the response is
    clamped to 1..100; the raw value is what is passed on to
    ``search_rag_chunks``. Any exception becomes a JSON 500.
    """
    args = request.args
    query_text = (args.get("q") or "").strip()
    user_id = args.get("user_id", type=int)
    page = args.get("page", default=1, type=int)
    limit = args.get("limit", default=30, type=int)

    # Checked in this order so a request missing both reports user_id first.
    required_checks = (
        (user_id, "'user_id' is required."),
        (query_text, "'q' is required."),
    )
    for value, message in required_checks:
        if not value:
            return _error(message, 400)

    reported_limit = min(max(1, limit), 100)
    try:
        matches = search_rag_chunks(
            user_id=user_id,
            query_text=query_text,
            page=page,
            limit=limit,
        )
        body = {"results": matches, "page": page, "limit": reported_limit}
        return jsonify(body)
    except Exception as exc:
        return _error(str(exc), 500)
|
||||
|
||||
|
||||
# ==================== Existing CRUD Routes ====================
|
||||
|
||||
@api.post("/users")
|
||||
def api_create_user():
|
||||
@@ -57,8 +374,6 @@ def api_get_user(user_id: int):
|
||||
return jsonify(user)
|
||||
|
||||
|
||||
# ==================== Audio Posts ====================
|
||||
|
||||
@api.post("/posts")
|
||||
def api_create_post():
|
||||
payload = request.get_json(force=True, silent=False) or {}
|
||||
@@ -112,8 +427,6 @@ def api_post_bundle(post_id: int):
|
||||
return jsonify(bundle)
|
||||
|
||||
|
||||
# ==================== Archive Files ====================
|
||||
|
||||
@api.post("/posts/<int:post_id>/files")
|
||||
def api_add_file(post_id: int):
|
||||
payload = request.get_json(force=True, silent=False) or {}
|
||||
@@ -133,8 +446,6 @@ def api_list_files(post_id: int):
|
||||
return _error(str(e), 500)
|
||||
|
||||
|
||||
# ==================== Metadata ====================
|
||||
|
||||
@api.put("/posts/<int:post_id>/metadata")
|
||||
def api_put_metadata(post_id: int):
|
||||
payload = request.get_json(force=True, silent=False) or {}
|
||||
@@ -156,8 +467,6 @@ def api_get_metadata(post_id: int):
|
||||
return jsonify(row)
|
||||
|
||||
|
||||
# ==================== Rights ====================
|
||||
|
||||
@api.put("/posts/<int:post_id>/rights")
|
||||
def api_put_rights(post_id: int):
|
||||
payload = request.get_json(force=True, silent=False) or {}
|
||||
@@ -175,8 +484,6 @@ def api_get_rights(post_id: int):
|
||||
return jsonify(row)
|
||||
|
||||
|
||||
# ==================== RAG Chunks ====================
|
||||
|
||||
@api.post("/posts/<int:post_id>/chunks")
|
||||
def api_add_chunks(post_id: int):
|
||||
payload = request.get_json(force=True, silent=False) or {}
|
||||
@@ -203,8 +510,6 @@ def api_get_chunks(post_id: int):
|
||||
return _error(str(e), 500)
|
||||
|
||||
|
||||
# ==================== Audit Log ====================
|
||||
|
||||
@api.post("/audit")
|
||||
def api_create_audit():
|
||||
payload = request.get_json(force=True, silent=False) or {}
|
||||
|
||||
@@ -63,6 +63,10 @@ def get_user_by_id(user_id: int) -> Optional[Dict[str, Any]]:
|
||||
return _first(supabase.table("users").select("*").eq("user_id", user_id).limit(1).execute())
|
||||
|
||||
|
||||
def get_user_by_email(email: str) -> Optional[Dict[str, Any]]:
    """Fetch at most one ``users`` row whose ``email`` equals *email*.

    The result is whatever ``_first`` extracts from the query response.
    """
    lookup = supabase.table("users").select("*").eq("email", email).limit(1)
    response = lookup.execute()
    return _first(response)
|
||||
|
||||
|
||||
# ==================== Audio Posts ====================
|
||||
|
||||
def create_audio_post(payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||
@@ -102,6 +106,10 @@ def get_audio_post_by_id(post_id: int) -> Optional[Dict[str, Any]]:
|
||||
return _first(query.execute())
|
||||
|
||||
|
||||
def list_user_history(user_id: int, page: int = 1, limit: int = 20) -> List[Dict[str, Any]]:
    """Return one page of audio posts filtered to *user_id*.

    Delegates to ``list_audio_posts`` with the same ``page`` and ``limit``.
    """
    history = list_audio_posts(page=page, limit=limit, user_id=user_id)
    return history
|
||||
|
||||
|
||||
def list_audio_posts(page: int = 1, limit: int = 20, visibility: Optional[str] = None, user_id: Optional[int] = None) -> List[Dict[str, Any]]:
|
||||
start, end = _paginate(page, limit)
|
||||
query = supabase.table("audio_posts").select("*, users(user_id, email, display_name, avatar_url)")
|
||||
@@ -267,6 +275,23 @@ def list_rag_chunks(post_id: int, page: int = 1, limit: int = 200) -> List[Dict[
|
||||
return _rows(response)
|
||||
|
||||
|
||||
def search_rag_chunks(user_id: int, query_text: str, page: int = 1, limit: int = 30) -> List[Dict[str, Any]]:
    """Find transcript chunks of one user's posts that contain *query_text*.

    Matching is a case-insensitive substring search on ``rag_chunks.text``,
    restricted through an inner join to posts whose ``user_id`` matches, and
    ordered newest first.

    Args:
        user_id: Owner of the posts to search.
        query_text: Text to look for; matched literally.
        page: Page number handed to ``_paginate``.
        limit: Page size handed to ``_paginate``.

    Returns:
        The rows ``_rows`` extracts from the query response.
    """
    start, end = _paginate(page, limit)

    # Escape LIKE metacharacters so "%" and "_" typed by the user match
    # themselves instead of acting as wildcards. The backslash goes first so
    # the escapes added afterwards are not doubled.
    # NOTE(review): PostgREST documents "*" as an alias for "%" in like/ilike
    # patterns, so a "*" in the query may still behave as a wildcard - confirm.
    literal_text = (
        query_text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )

    response = (
        supabase.table("rag_chunks")
        .select(
            "chunk_id, post_id, start_sec, end_sec, text, confidence, created_at, "
            "audio_posts!inner(post_id, user_id, title, visibility, created_at)"
        )
        .eq("audio_posts.user_id", user_id)
        .ilike("text", f"%{literal_text}%")
        .order("created_at", desc=True)
        .range(start, end)
        .execute()
    )
    return _rows(response)
|
||||
|
||||
|
||||
# ==================== Audit Log ====================
|
||||
|
||||
def add_audit_log(payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||
|
||||
BIN
backend/uploads/5bd199e8-fa0a-47e3-aca8-0a4e732c0610_data.m4a
Normal file
BIN
backend/uploads/5bd199e8-fa0a-47e3-aca8-0a4e732c0610_data.m4a
Normal file
Binary file not shown.
Reference in New Issue
Block a user