From 462948b849b1a6530d17340137dda7a27dc0f96b Mon Sep 17 00:00:00 2001
From: Gaumit Kauts <123269559+Gaumit-Kauts@users.noreply.github.com>
Date: Sat, 14 Feb 2026 16:10:19 -0700
Subject: [PATCH] created speech_to_text

Using openai whisper model for converting speech to text
---
 speech_to_text.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 speech_to_text.py

diff --git a/speech_to_text.py b/speech_to_text.py
new file mode 100644
--- /dev/null
+++ b/speech_to_text.py
@@ -0,0 +1,54 @@
+import os
+import tempfile
+
+from flask import Flask, request, jsonify
+from openai import OpenAI
+
+app = Flask(__name__)
+client = OpenAI()  # reads OPENAI_API_KEY from env
+
+
+def transcribe_with_timestamps(local_path: str):
+    """Transcribe an audio file and return segment-level timestamps.
+
+    Uses whisper-1 because timestamp_granularities requires
+    response_format="verbose_json", which only whisper-1 supports.
+    Returns a list of {"start_ms", "end_ms", "text"} dicts.
+    """
+    with open(local_path, "rb") as f:
+        tr = client.audio.transcriptions.create(
+            model="whisper-1",
+            file=f,
+            response_format="verbose_json",
+            timestamp_granularities=["segment"],
+        )
+    segments = []
+    for seg in tr.segments:
+        # NOTE(review): newer openai SDK versions return segment objects,
+        # not dicts; attribute access (seg.start) may be needed — confirm
+        # against the pinned openai package version.
+        segments.append({
+            "start_ms": int(seg["start"] * 1000),
+            "end_ms": int(seg["end"] * 1000),
+            "text": seg["text"].strip(),
+        })
+    return segments
+
+
+@app.post("/upload-audio")
+def upload_audio():
+    file = request.files.get("file")
+    if file is None or not file.filename:
+        return jsonify({"error": "missing 'file' upload"}), 400
+
+    # Never build a filesystem path from the client-supplied filename
+    # (path-traversal risk); write to a private temp file instead.
+    suffix = os.path.splitext(file.filename)[1]
+    fd, local_path = tempfile.mkstemp(suffix=suffix)
+    try:
+        with os.fdopen(fd, "wb") as out:
+            file.save(out)
+        segments = transcribe_with_timestamps(local_path)
+    finally:
+        os.remove(local_path)  # always clean up the upload
+
+    # TODO: insert `segments` into transcript_segments table
+    return jsonify({"segments": segments})