backend for speech to text

This commit is contained in:
Gaumit Kauts
2026-02-14 16:23:34 -07:00
parent 462948b849
commit 001220d0aa
3 changed files with 226 additions and 25 deletions

View File

@@ -1,2 +1,49 @@
# AI-Titan-Forge
### CalgaryHack26 project
## Backend (Audio -> Whisper -> Supabase)
This backend:
1. accepts an audio file,
2. transcribes it with OpenAI Whisper (`whisper-1`),
3. stores transcript text in Supabase `posts.transcribed_text`,
4. links categories in `post_categories`.
## Install
```bash
pip install -r requirements.txt
```
## Environment variables
- `OPENAI_API_KEY`
- `SUPABASE_URL`
- `SUPABASE_SERVICE_ROLE_KEY` (use service-role key on backend only)
- `UPLOAD_DIR` (default: `uploads`)
- `PORT` (default: `5000`)
## Run
```bash
python speech_to_text.py
```
## Endpoints
- `GET /health`
- `GET /health/db`
- `POST /upload-audio`
## Upload example
```bash
curl -X POST http://localhost:5000/upload-audio \
-F "file=@sample.mp3" \
-F "user_id=1" \
-F "title=My oral history" \
-F "category_ids=1,4" \
-F "is_private=false"
```
## Required tables in Supabase
Your Supabase Postgres project should already contain:
- `users`
- `posts`
- `post_categories`
- `categories`
Note: `user_id` must exist in `users` before upload.

5
requirements.txt Normal file
View File

@@ -0,0 +1,5 @@
flask
openai
supabase
python-dotenv
werkzeug

View File

@@ -1,36 +1,185 @@
import os
from flask import Flask, request, jsonify
import os
import uuid
from pathlib import Path
from flask import Flask, jsonify, request
from openai import OpenAI
from supabase import Client, create_client
from werkzeug.utils import secure_filename
app = Flask(__name__)
client = OpenAI() # reads OPENAI_API_KEY from env
def transcribe_with_timestamps(local_path: str):
# Use whisper-1 for verbose_json + timestamp_granularities (segment/word)
# timestamp_granularities requires response_format="verbose_json" :contentReference[oaicite:1]{index=1}
with open(local_path, "rb") as f:
tr = client.audio.transcriptions.create(
model="whisper-1",
file=f,
response_format="verbose_json",
timestamp_granularities=["segment"],
UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", "uploads"))
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
ALLOWED_EXTENSIONS = {"mp3", "wav", "m4a", "ogg", "webm", "flac", "mp4"}
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
raise RuntimeError(
"Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY environment variables."
)
segments = []
for seg in tr.segments:
segments.append({
"start_ms": int(seg["start"] * 1000),
"end_ms": int(seg["end"] * 1000),
"text": seg["text"].strip(),
})
return segments
supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
def allowed_file(filename: str) -> bool:
if "." not in filename:
return False
extension = filename.rsplit(".", 1)[1].lower()
return extension in ALLOWED_EXTENSIONS
def parse_bool(value: str | None, default: bool = False) -> bool:
if value is None:
return default
return value.lower() in {"1", "true", "yes", "on"}
def parse_category_ids(value: str | None) -> list[int]:
if not value:
return []
ids: list[int] = []
for item in value.split(","):
candidate = item.strip()
if not candidate:
continue
ids.append(int(candidate))
return ids
def transcribe_audio(local_path: Path) -> str:
with local_path.open("rb") as audio_file:
transcript = client.audio.transcriptions.create(
model="whisper-1",
file=audio_file,
response_format="text",
)
if isinstance(transcript, str):
return transcript.strip()
text_value = getattr(transcript, "text", "")
return str(text_value).strip()
def verify_supabase_connection() -> None:
supabase.table("categories").select("category_id").limit(1).execute()
def insert_post(
*,
user_id: int,
title: str | None,
transcribed_text: str,
audio_url: str,
is_private: bool,
image_url: str | None,
category_ids: list[int],
) -> int:
post_payload = {
"user_id": user_id,
"title": title,
"transcribed_text": transcribed_text,
"audio_url": audio_url,
"is_private": is_private,
"image_url": image_url,
}
post_response = supabase.table("posts").insert(post_payload).execute()
post_rows = getattr(post_response, "data", None) or []
if not post_rows:
raise RuntimeError("Supabase insert failed for posts table.")
post_id = int(post_rows[0]["post_id"])
if category_ids:
category_rows = [
{"post_id": post_id, "category_id": category_id}
for category_id in category_ids
]
supabase.table("post_categories").insert(category_rows).execute()
return post_id
@app.get("/health")
def health_check():
return jsonify({"status": "ok"})
@app.get("/health/db")
def db_health_check():
try:
verify_supabase_connection()
return jsonify({"status": "ok", "database": "supabase"})
except Exception as error:
return jsonify({"status": "error", "details": str(error)}), 500
@app.post("/upload-audio")
def upload_audio():
if "file" not in request.files:
return jsonify({"error": "Missing 'file' in form-data."}), 400
file = request.files["file"]
local_path = f"/tmp/{file.filename}"
if not file.filename:
return jsonify({"error": "Filename is empty."}), 400
if not allowed_file(file.filename):
return jsonify({"error": "Unsupported file extension."}), 400
user_id_raw = request.form.get("user_id")
if not user_id_raw:
return jsonify({"error": "'user_id' is required in form-data."}), 400
try:
user_id = int(user_id_raw)
except ValueError:
return jsonify({"error": "'user_id' must be an integer."}), 400
try:
category_ids = parse_category_ids(request.form.get("category_ids"))
except ValueError:
return jsonify(
{"error": "'category_ids' must be a comma-separated list of integers."}
), 400
title = request.form.get("title")
image_url = request.form.get("image_url")
is_private = parse_bool(request.form.get("is_private"), default=False)
safe_name = secure_filename(file.filename)
unique_name = f"{uuid.uuid4()}_{safe_name}"
local_path = UPLOAD_DIR / unique_name
file.save(local_path)
segments = transcribe_with_timestamps(local_path)
try:
transcript_text = transcribe_audio(local_path)
post_id = insert_post(
user_id=user_id,
title=title,
transcribed_text=transcript_text,
audio_url=str(local_path).replace("\\", "/"),
is_private=is_private,
image_url=image_url,
category_ids=category_ids,
)
except Exception as error:
return jsonify({"error": "Failed to process audio", "details": str(error)}), 500
# TODO: insert `segments` into transcript_segments table
return jsonify({"segments": segments})
return jsonify(
{
"message": "Audio uploaded, transcribed, and saved to Supabase.",
"post_id": post_id,
"transcribed_text": transcript_text,
"audio_url": str(local_path).replace("\\", "/"),
}
)
if __name__ == "__main__":
app.run(host="0.0.0.0", port=int(os.getenv("PORT", "5000")), debug=True)