rag vector embeddings

2026-02-15 01:21:05 -07:00
parent 6e5b4850b9
commit 5f471c21be
5 changed files with 65 additions and 6 deletions
--- a/backend/pycache/api_routes.cpython-311.pyc
+++ b/backend/pycache/api_routes.cpython-311.pyc
--- a/backend/pycache/db_queries.cpython-311.pyc
+++ b/backend/pycache/db_queries.cpython-311.pyc
--- a/backend/api_routes.py
+++ b/backend/api_routes.py
@@ -7,7 +7,9 @@ import hashlib
 import json
 import os
 import uuid
+import re
 from pathlib import Path
+from typing import Any, List

 from dotenv import load_dotenv
 from faster_whisper import WhisperModel
@@ -34,6 +36,7 @@ from db_queries import (
    list_rag_chunks,
    list_user_history,
    search_rag_chunks,
+    search_rag_chunks_vector,
    update_audio_post,
    upload_storage_object,
    upsert_archive_metadata,
@@ -96,7 +99,7 @@ def _build_prompt(transcript_text: str, title: str) -> str:
        f"{transcript_text}\n\n"
        "Answer user questions grounded in this transcript."
    )
-def _add_audio_url(post: Dict[str, Any]) -> Dict[str, Any]:
+def _add_audio_url(post: dict[str, Any]) -> dict[str, Any]:
    """Add signed audio URL to post if ready"""
    if post.get("status") == "ready":
        try:
@@ -107,6 +110,29 @@ def _add_audio_url(post: Dict[str, Any]) -> Dict[str, Any]:
    return post


+def _local_embedding(text: str, dimensions: int = 1536) -> List[float]:
+    """
+    Free deterministic embedding fallback (offline).
+    Replace with model-based embeddings later if needed.
+    """
+    vector = [0.0] * dimensions
+    tokens = re.findall(r"[A-Za-z0-9']+", text.lower())
+    if not tokens:
+        return vector
+
+    for token in tokens:
+        digest = hashlib.sha256(token.encode("utf-8")).digest()
+        idx = int.from_bytes(digest[:4], "big") % dimensions
+        sign = 1.0 if (digest[4] & 1) == 0 else -1.0
+        weight = 1.0 + (digest[5] / 255.0) * 0.25
+        vector[idx] += sign * weight
+
+    norm = sum(v * v for v in vector) ** 0.5
+    if norm > 0:
+        vector = [v / norm for v in vector]
+    return vector
+
+

@api.get("/health")
 def health():
@@ -274,7 +300,7 @@ def api_upload_post():
                    "end_sec": float(seg.end),
                    "text": segment_text,
                    "confidence": float(seg.avg_logprob) if seg.avg_logprob is not None else None,
-                    "embedding": None,
+                    "embedding": _local_embedding(segment_text),
                }
            )

@@ -360,17 +386,31 @@ def api_user_history(user_id: int):
 def api_rag_search():
    query_text = (request.args.get("q") or "").strip()
    user_id = request.args.get("user_id", type=int)
+    query_embedding_raw = request.args.get("query_embedding")
    page = request.args.get("page", default=1, type=int)
    limit = request.args.get("limit", default=30, type=int)

    if not user_id:
        return _error("'user_id' is required.", 400)
-    if not query_text:
-        return _error("'q' is required.", 400)

    try:
+        if query_embedding_raw:
+            try:
+                parsed = json.loads(query_embedding_raw)
+                if not isinstance(parsed, list):
+                    return _error("'query_embedding' must be a JSON array.", 400)
+                query_embedding = [float(v) for v in parsed]
+            except Exception:
+                return _error("Invalid 'query_embedding'. Example: [0.1,0.2,...]", 400)
+
+            rows = search_rag_chunks_vector(user_id=user_id, query_embedding=query_embedding, limit=limit)
+            return jsonify({"results": rows, "mode": "vector", "limit": min(max(1, limit), 100)})
+
+        if not query_text:
+            return _error("'q' is required when 'query_embedding' is not provided.", 400)
+
        rows = search_rag_chunks(user_id=user_id, query_text=query_text, page=page, limit=limit)
-        return jsonify({"results": rows, "page": page, "limit": min(max(1, limit), 100)})
+        return jsonify({"results": rows, "mode": "text", "page": page, "limit": min(max(1, limit), 100)})
    except Exception as e:
        return _error(str(e), 500)

--- a/backend/db_queries.py
+++ b/backend/db_queries.py
@@ -322,6 +322,9 @@ def add_rag_chunks(post_id: int, chunks: List[Dict[str, Any]]) -> List[Dict[str,

    rows = []
    for c in chunks:
+        embedding = c.get("embedding")
+        if isinstance(embedding, list):
+            embedding = "[" + ",".join(str(float(v)) for v in embedding) + "]"
        rows.append(
            {
                "post_id": post_id,
@@ -329,7 +332,7 @@ def add_rag_chunks(post_id: int, chunks: List[Dict[str, Any]]) -> List[Dict[str,
                "end_sec": c.get("end_sec"),
                "text": c.get("text"),
                "confidence": c.get("confidence"),
-                "embedding": c.get("embedding"),
+                "embedding": embedding,
            }
        )

@@ -367,6 +370,22 @@ def search_rag_chunks(user_id: int, query_text: str, page: int = 1, limit: int =
    return _rows(response)


+def search_rag_chunks_vector(user_id: int, query_embedding: List[float], limit: int = 30) -> List[Dict[str, Any]]:
+    """
+    Vector search via SQL RPC function `match_rag_chunks` (pgvector).
+    """
+    vector_text = "[" + ",".join(str(float(v)) for v in query_embedding) + "]"
+    response = supabase.rpc(
+        "match_rag_chunks",
+        {
+            "p_user_id": user_id,
+            "p_query_embedding": vector_text,
+            "p_match_count": min(max(1, limit), 100),
+        },
+    ).execute()
+    return _rows(response)
+
+
 # ==================== Audit Log ====================

 def add_audit_log(payload: Dict[str, Any]) -> Dict[str, Any]:
--- a/backend/uploads/e14987f3-5d75-45da-948f-51e640cb0b5d_data.m4a
+++ b/backend/uploads/e14987f3-5d75-45da-948f-51e640cb0b5d_data.m4a