feat: Add record.py audio recording and transcription service from PR #42

Steve · Steve · commit 330e53191c21 · 2025-06-22T02:20:40.000-04:00
- Audio recording from VDO.Ninja rooms
- Automatic transcription using Whisper AI
- FastAPI REST endpoints for start/stop recording
- Process monitoring with 1-hour timeout
- Systemd service configuration
- HTML templates for web interface
- Transcriptions saved to stt/ directory

Co-authored-by: astroport contributor
diff --git a/.gitignore b/.gitignore
@@ -177,3 +177,8 @@ Thumbs.db
 .Trash-*
 .nfs*
 ndi/gst-plugin-ndi/
+
+# record.py output: recorded audio files and Whisper transcriptions
+*_audio.ts
+stt/*.txt
+stt/*.json
diff --git a/README.md b/README.md
@@ -696,6 +696,68 @@ Please note, the raspberry_ninja publish.py script can both send and recieve MID
 
 midi demo video: https://youtu.be/Gry9UFtOTmQ
 
+## `record.py` - Audio Recording and Transcription Service
+
+The `record.py` microservice provides audio recording and automatic transcription capabilities using OpenAI's Whisper AI model.
+
+### Features
+
+- **Audio Recording**: Record audio streams from VDO.Ninja rooms
+- **Automatic Transcription**: Transcribe recordings using Whisper AI
+- **REST API**: Start/stop recordings via HTTP endpoints
+- **Process Monitoring**: Automatic timeout to prevent runaway recordings
+- **Command Line Interface**: Direct CLI usage for recording
+
+### Prerequisites
+
+```bash
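+# Install Whisper and dependencies
+# (jinja2 is required for the HTML templates, python-multipart for form parsing)
+pip3 install openai-whisper fastapi uvicorn jinja2 python-multipart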
+```
+
+### Usage
+
+#### Start the FastAPI Server
+
+```bash
+python3 record.py --host 0.0.0.0 --port 8000
+```
+
+#### Start Recording (API)
+
+```bash
+curl -X POST -F "room=myRoom" -F "record=myRecord" http://localhost:8000/rec
+```
+
+#### Stop Recording (API)
+
+```bash
+curl -X POST -F "record=myRecord" -F "process_pid=<PID>" -F "language=en" http://localhost:8000/stop
+```
+
+#### Command Line Recording
+
+```bash
+# Start recording
+python3 record.py --room myRoom --record myRecord
+
+# Stop recording and transcribe
+python3 record.py --stop --pid <PID> --record myRecord --language en
+```
+
+### Systemd Service
+
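+To run as a system service, run the setup script from the repository root as a regular user with sudo rights. The script refuses to run as root and calls `sudo` itself where needed; the installed service listens on port 9000:
+
+```bash
+./setup.ninja_record.systemd.sh
+```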
+
+### Output
+
+- Audio files: Looked up as `<record_id>_*_audio.ts` in the working directory and deleted once they have been transcribed
+- Transcriptions: Saved in the `stt/` directory as `<record_id>_speech.txt`
+
 ### Note:
 
 - Installation from source is pretty slow and problematic on a rpi; using system images makes using this so much easier.
diff --git a/record.py b/record.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+from fastapi import FastAPI, Request, Form, HTTPException
+from fastapi.responses import HTMLResponse, JSONResponse
+from fastapi.templating import Jinja2Templates
+import whisper
+import subprocess
+import random
+import os
+import logging
+import glob
+import argparse
+import threading
+import time
+import uvicorn 
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+app = FastAPI()
+# HTML templates are looked up in the relative "templates" directory
+templates = Jinja2Templates(directory="templates")
+# The Whisper "medium" model is loaded once at import time and reused by every transcription call
+model = whisper.load_model("medium")
+
+# List of (process, start_time) tuples used to track recording processes
+processes = []
+
+# Process monitoring function
+def _forget_process(process_info):
+    """Remove ``process_info`` from ``processes``; ignore it if already gone."""
+    try:
+        processes.remove(process_info)
+    except ValueError:
+        # Another code path removed the entry first; nothing left to do.
+        pass
+
+
+def monitor_processes():
+    """Enforce the one-hour limit on tracked recording processes.
+
+    Runs forever and is meant to be the target of a daemon thread. Once a
+    minute it walks the module-level ``processes`` list of
+    ``(process, start_time)`` tuples and:
+
+    * drops entries whose process has already exited, and
+    * kills, reaps and drops entries older than 3600 seconds.
+    """
+    while True:
+        current_time = time.time()
+        # Iterate over a snapshot: removing from the list that is being
+        # iterated would silently skip the entry following each removal.
+        for process_info in list(processes):
+            process, start_time = process_info
+            if process.poll() is not None:
+                # The process already finished, so there is nothing to kill.
+                _forget_process(process_info)
+                continue
+            if current_time - start_time > 3600:  # 3600 seconds = 1 hour
+                logger.info("Killing process with PID: %d due to timeout", process.pid)
+                process.kill()
+                try:
+                    # Reap the child so it does not linger as a zombie.
+                    process.wait(timeout=10)
+                except subprocess.TimeoutExpired:
+                    logger.warning("Process with PID: %d did not exit after kill", process.pid)
+                _forget_process(process_info)
+        time.sleep(60)  # Check every minute
+
+# Start the monitoring thread (created as a daemon thread)
+monitor_thread = threading.Thread(target=monitor_processes, daemon=True)
+monitor_thread.start()
+
+@app.get("/", response_class=HTMLResponse)
+async def index(request: Request, room: str = "", record: str = ""):
+    """Render ``index.html``, passing the optional ``room`` and ``record`` query values to the template."""
+    logger.info("Serving index page with room: %s (%s)", room, record)
+    context = {"request": request, "room": room, "record": record}
+    return templates.TemplateResponse("index.html", context)
+
+@app.api_route("/rec", methods=["GET", "POST"])
+async def start_recording(request: Request, room: str = Form(None), record: str = Form(None)):
+    """Start an audio-only recording of a room and render the recording page.
+
+    ``room`` and ``record`` are read from the form body and fall back to the
+    query string, so the endpoint works for both POST and GET.
+
+    Raises:
+        HTTPException: 400 when either value is missing, starts with ``-``
+            (it would be parsed as an option by publish.py), or when
+            ``record`` contains a path separator.
+    """
+    room = room or request.query_params.get("room")
+    record = record or request.query_params.get("record")
+
+    if not room or not record:
+        raise HTTPException(status_code=400, detail="Room and record parameters must not be empty")
+
+    # Both values come straight from the client and end up on a command line;
+    # ``record`` is additionally used as a file name prefix by /stop.
+    if room.startswith("-") or record.startswith("-") or os.path.basename(record) != record:
+        raise HTTPException(status_code=400, detail="Invalid room or record parameter")
+
+    logger.info("Starting recording for room: %s with record ID: %s", room, record)
+
+    # Launch the audio recording in the background and discard its output.
+    # A pipe that nobody reads leaks a file descriptor per request and makes
+    # publish.py block as soon as the pipe buffer is full.
+    process = subprocess.Popen(
+        ["python3", "publish.py", "--room", room, "--record", record, "--novideo"],
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
+    logger.info("Started publish.py process with PID: %d", process.pid)
+
+    # Track the process together with its start time for the timeout monitor
+    processes.append((process, time.time()))
+
+    # The recording page receives the PID so it can later be sent to /stop
+    return templates.TemplateResponse("recording.html", {"request": request, "room": room, "record": record, "process_pid": process.pid})
+
+@app.post("/stop")
+async def stop_recording(record: str = Form(...), process_pid: int = Form(...), language: str = Form(...)):
+    """Stop a recording, transcribe its audio with Whisper and save the text.
+
+    Args:
+        record: Record ID; prefix of the audio file and name of the transcript.
+        process_pid: PID of the publish.py process started by this server.
+        language: Language passed to the Whisper ``transcribe`` call.
+
+    Returns:
+        ``{"transcription": text}`` on success, or ``{"error": message}`` when
+        no audio file is found or the transcription fails.
+
+    Raises:
+        HTTPException: 400 when ``record`` is empty or contains a path separator.
+    """
+    logger.info("Stopping recording for record ID: %s with process PID: %d", record, process_pid)
+
+    # ``record`` is used to build file names below; keep it inside the
+    # working directory and the stt/ directory.
+    if not record or os.path.basename(record) != record:
+        raise HTTPException(status_code=400, detail="Invalid record ID")
+
+    # Only signal processes this server started itself: the PID is supplied
+    # by the HTTP client and must not be usable to terminate arbitrary processes.
+    tracked = next((info for info in processes if info[0].pid == process_pid), None)
+    if tracked is None:
+        logger.warning("PID %d is not a tracked recording process; no signal sent", process_pid)
+    else:
+        process = tracked[0]
+        process.terminate()
+        try:
+            # Wait for the recorder to exit before the audio file is read.
+            process.wait(timeout=10)
+        except subprocess.TimeoutExpired:
+            logger.warning("Process with PID: %d ignored SIGTERM, killing it", process_pid)
+            process.kill()
+            process.wait()
+        try:
+            processes.remove(tracked)
+        except ValueError:
+            # Already removed by the timeout monitor.
+            pass
+        logger.info("Stopped publish.py process with PID: %d", process_pid)
+
+    # Find the matching audio file; escape the ID so glob metacharacters in it
+    # are matched literally, and sort for a deterministic choice.
+    audio_files = sorted(glob.glob(f"{glob.escape(record)}_*_audio.ts"))
+    if not audio_files:
+        logger.error("No audio file found for record ID: %s", record)
+        return {"error": f"No audio file found for record ID: {record}"}
+
+    audio_file = audio_files[0]
+    logger.info("Transcribing audio file: %s", audio_file)
+
+    try:
+        speech = model.transcribe(audio_file, language=language)['text']
+        logger.info("Transcription completed for record ID: %s", record)
+    except Exception as e:
+        logger.error("Failed to transcribe audio file: %s", str(e))
+        return {"error": f"Failed to transcribe audio file: {str(e)}"}
+
+    # Write the transcription to a text file. UTF-8 is set explicitly so
+    # non-ASCII text does not depend on the locale of the service.
+    os.makedirs("stt", exist_ok=True)
+    transcript_file = f"stt/{record}_speech.txt"
+    with open(transcript_file, "w", encoding="utf-8") as f:
+        f.write(speech)
+    logger.info("Transcription saved to: %s", transcript_file)
+
+    # Delete the audio file now that it has been transcribed
+    os.remove(audio_file)
+    logger.info("Audio file %s removed.", audio_file)
+
+    return {"transcription": speech}
+
+@app.get("/stt")
+async def get_transcription(id: str):
+    """Return a stored transcription and the CID obtained by adding it to IPFS.
+
+    Args:
+        id: Record ID whose transcript ``stt/<id>_speech.txt`` is requested.
+
+    Returns:
+        ``{"transcription": text, "cid": cid}`` on success; a JSON error with
+        status 400 (invalid ID), 404 (no transcript) or 500 (IPFS failure).
+    """
+    # The ID becomes part of a file path; refuse path separators so it
+    # cannot point outside the stt/ directory.
+    if os.path.basename(id) != id:
+        logger.error("Invalid record ID: %s", id)
+        return JSONResponse(status_code=400, content={"error": "Invalid record ID"})
+
+    transcript_file = f"stt/{id}_speech.txt"
+    try:
+        with open(transcript_file, "r", encoding="utf-8") as f:
+            transcription = f.read()
+    except FileNotFoundError:
+        logger.error("No transcription file found for record ID: %s", id)
+        return JSONResponse(status_code=404, content={"error": f"No transcription file found for record ID: {id}"})
+
+    # Add the file to IPFS
+    try:
+        logger.info("Adding file to IPFS: %s", transcript_file)
+        # check=True turns a non-zero exit status into CalledProcessError
+        # instead of an opaque IndexError on empty output.
+        result = subprocess.run(["ipfs", "add", transcript_file], capture_output=True, text=True, check=True)
+        fields = result.stdout.split()
+        if len(fields) < 2:
+            raise ValueError(f"unexpected ipfs output: {result.stdout!r}")
+        # The CID is taken from the second whitespace-separated field
+        cid = fields[1]
+        logger.info("Added file to IPFS: %s with CID: %s", transcript_file, cid)
+    except subprocess.CalledProcessError as e:
+        detail = (e.stderr or "").strip() or str(e)
+        logger.error("Failed to add file to IPFS: %s", detail)
+        return JSONResponse(status_code=500, content={"error": f"Failed to add file to IPFS: {detail}"})
+    except (OSError, ValueError) as e:
+        # OSError covers a missing or non-executable ipfs binary
+        logger.error("Failed to add file to IPFS: %s", str(e))
+        return JSONResponse(status_code=500, content={"error": f"Failed to add file to IPFS: {str(e)}"})
+
+    logger.info("Returning transcription and CID for record ID: %s", id)
+    return {"transcription": transcription, "cid": cid}
+
+def start_recording_cli(room, record):
+    """Start an audio-only publish.py recording and return its PID.
+
+    Args:
+        room: Room name passed to publish.py as ``--room``.
+        record: Record ID passed to publish.py as ``--record``.
+
+    Returns:
+        The PID of the spawned publish.py process.
+    """
+    logger.info("Starting recording for room: %s with record ID: %s", room, record)
+
+    # Launch the audio recording in the background and discard its output.
+    # A pipe owned by this process is never read, and it is closed when the
+    # caller exits, which leaves the recorder writing to a broken or full pipe.
+    process = subprocess.Popen(
+        ["python3", "publish.py", "--room", room, "--record", record, "--novideo"],
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
+    logger.info("Started publish.py process with PID: %d", process.pid)
+
+    # Track the process together with its start time
+    processes.append((process, time.time()))
+
+    return process.pid
+
+def stop_recording_cli(record, process_pid, language):
+    """Stop a recording by PID, transcribe its audio and save the text.
+
+    Args:
+        record: Record ID; prefix of the audio file and name of the transcript.
+        process_pid: PID of the publish.py process to terminate.
+        language: Language passed to the Whisper ``transcribe`` call.
+
+    Returns:
+        ``{"transcription": text}`` on success, otherwise ``{"error": message}``.
+    """
+    import signal  # local import: only this function needs it
+
+    logger.info("Stopping recording for record ID: %s with process PID: %d", record, process_pid)
+
+    # ``record`` is used to build file names below; refuse path separators.
+    if not record or os.path.basename(record) != record:
+        logger.error("Invalid record ID: %s", record)
+        return {"error": f"Invalid record ID: {record}"}
+
+    # PIDs <= 0 address process groups rather than a single process.
+    if process_pid <= 0:
+        logger.error("Invalid process PID: %d", process_pid)
+        return {"error": f"Invalid process PID: {process_pid}"}
+
+    # Stop the recording process
+    try:
+        os.kill(process_pid, signal.SIGTERM)
+    except OSError as e:
+        # Best effort, as before: a missing process or a refused signal does
+        # not prevent transcribing an audio file that already exists.
+        logger.warning("Could not signal process with PID: %d (%s)", process_pid, e)
+    else:
+        # Give the recorder a few seconds to exit before reading its file.
+        deadline = time.monotonic() + 5
+        while time.monotonic() < deadline:
+            try:
+                os.kill(process_pid, 0)  # signal 0 only checks for existence
+            except OSError:
+                break
+            time.sleep(0.1)
+        logger.info("Stopped publish.py process with PID: %d", process_pid)
+
+    # Find the matching audio file; escape the ID so glob metacharacters in it
+    # are matched literally, and sort for a deterministic choice.
+    audio_files = sorted(glob.glob(f"{glob.escape(record)}_*_audio.ts"))
+    if not audio_files:
+        logger.error("No audio file found for record ID: %s", record)
+        return {"error": f"No audio file found for record ID: {record}"}
+
+    audio_file = audio_files[0]
+    logger.info("Transcribing audio file: %s", audio_file)
+
+    try:
+        speech = model.transcribe(audio_file, language=language)['text']
+        logger.info("Transcription completed for record ID: %s", record)
+    except Exception as e:
+        logger.error("Failed to transcribe audio file: %s", str(e))
+        return {"error": f"Failed to transcribe audio file: {str(e)}"}
+
+    # Write the transcription to a text file, explicitly as UTF-8
+    os.makedirs("stt", exist_ok=True)
+    transcript_file = f"stt/{record}_speech.txt"
+    with open(transcript_file, "w", encoding="utf-8") as f:
+        f.write(speech)
+    logger.info("Transcription saved to: %s", transcript_file)
+
+    # Delete the audio file now that it has been transcribed
+    os.remove(audio_file)
+    logger.info("Audio file %s removed.", audio_file)
+
+    return {"transcription": speech}
+
+if __name__ == "__main__":
+    # Entry point with three modes:
+    #   --stop --pid N --record ID   stop a recording and transcribe it
+    #   --room R --record ID         start a recording and print its PID
+    #   (neither)                    run the FastAPI server
+    parser = argparse.ArgumentParser(description="Démarrer le serveur FastAPI avec des paramètres personnalisés.")
+    parser.add_argument("--host", type=str, default="0.0.0.0", help="Adresse hôte pour le serveur FastAPI.")
+    parser.add_argument("--port", type=int, default=9000, help="Port pour le serveur FastAPI.")
+    parser.add_argument("--room", type=str, help="Room name for the recording session.")
+    parser.add_argument("--record", type=str, help="Record ID for the session.")
+    parser.add_argument("--stop", action="store_true", help="Stop the recording.")
+    parser.add_argument("--pid", type=int, help="Process PID to stop.")
+    parser.add_argument("--language", type=str, default="en", help="Language for transcription.")
+    args = parser.parse_args()
+
+    if args.stop:
+        # An incomplete stop request must not fall through to starting the server.
+        if not (args.pid and args.record):
+            parser.error("--stop requires both --pid and --record")
+        result = stop_recording_cli(args.record, args.pid, args.language)
+        print(result)
+    elif args.room or args.record:
+        # Likewise, a half-specified recording request is reported, not ignored.
+        if not (args.room and args.record):
+            parser.error("--room and --record must be given together")
+        pid = start_recording_cli(args.room, args.record)
+        print(f"Recording started with PID: {pid}")
+    else:
+        logger.info("Starting FastAPI server")
+        uvicorn.run(app, host=args.host, port=args.port)
diff --git a/setup.ninja_record.systemd.sh b/setup.ninja_record.systemd.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Install, enable and restart the ninja_record systemd service.
+# Run from the repository root as a regular user with sudo rights: the unit's
+# user and paths are taken from the invoking user and the current directory.
+set -euo pipefail
+
+if [ "$(id -u)" -eq 0 ]; then
+    echo "LANCEMENT root INTERDIT (use sudo user). " >&2
+    exit 1
+fi
+
+# Render the unit into a private temporary file rather than a predictable
+# /tmp path that is later copied as root.
+unit_file="$(mktemp)"
+trap 'rm -f "$unit_file"' EXIT
+
+sed -e "s~_USER_~$(id -un)~g" -e "s~_MY_PATH_~$(pwd)~g" templates/record.service.tpl > "$unit_file"
+
+cat "$unit_file"
+# mktemp creates the file with mode 0600; install it world-readable like other units
+sudo install -m 0644 "$unit_file" /etc/systemd/system/ninja_record.service
+
+sudo systemctl daemon-reload
+sudo systemctl enable ninja_record
+sudo systemctl restart ninja_record
diff --git a/stt/.readme b/stt/.readme
@@ -0,0 +1 @@
+Transcriptions will be saved here
diff --git a/templates/index.html b/templates/index.html
@@ -0,0 +1,61 @@
+<!DOCTYPE html>
+<!-- Landing page: form that posts a room name and record ID to /rec -->
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <title>Vdo.Ninja Audio to AI</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            background-color: #f0f0f0;
+            margin: 0;
+            padding: 0;
+            display: flex;
+            justify-content: center;
+            align-items: center;
+            height: 100vh;
+        }
+        .container {
+            background-color: #fff;
+            padding: 20px;
+            border-radius: 8px;
+            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+            text-align: center;
+        }
+        h1 {
+            color: #333;
+        }
+        p {
+            color: #666;
+        }
+        input[type="text"] {
+            /* border-box keeps the 100% width inside the container despite padding and border */
+            box-sizing: border-box;
+            padding: 10px;
+            margin: 10px 0;
+            border: 1px solid #ccc;
+            border-radius: 4px;
+            width: 100%;
+        }
+        button {
+            padding: 10px 20px;
+            background-color: #007bff;
+            color: #fff;
+            border: none;
+            border-radius: 4px;
+            cursor: pointer;
+        }
+        button:hover {
+            background-color: #0056b3;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>VDO.ninja audio to text</h1>
+        <p>Enter vdo.ninja room and push channel</p>
+        <form action="/rec" method="post">
+            <!-- aria-label gives each field an accessible name; placeholders alone do not -->
+            <input type="text" name="room" placeholder="Room Name" aria-label="Room Name" value="{{ room }}" required>
+            <input type="text" name="record" placeholder="Record ID" aria-label="Record ID" value="{{ record }}" required>
+            <button type="submit">Start Recording</button>
+        </form>
+        <p>code : <a href="https://github.com/papiche/raspberry_ninja/">https://github.com/papiche/raspberry_ninja/</a></p>
+    </div>
+</body>
+</html>
diff --git a/templates/record.service.tpl b/templates/record.service.tpl
@@ -0,0 +1,13 @@
+# systemd unit template for the record.py service.
+# _USER_ and _MY_PATH_ are placeholders filled in by setup.ninja_record.systemd.sh.
+[Unit]
+Description=Record Vdo Ninja STT
+After=network.target
+
+[Service]
+# The same placeholder is used for user and group, so a group named like the user is assumed
+User=_USER_
+Group=_USER_
+# record.py uses the relative paths "templates" and "stt/", resolved against this directory
+WorkingDirectory=_MY_PATH_
+ExecStart=/usr/bin/python3 _MY_PATH_/record.py --host 0.0.0.0 --port 9000
+Restart=always
+
+[Install]
+WantedBy=multi-user.target
diff --git a/templates/recording.html b/templates/recording.html