from flask import Flask, Response, jsonify, request
import gevent
import re
import gevent.monkey
import json
from waitress import serve
import logging

logger = logging.getLogger("waitress")
logger.setLevel(logging.INFO)

gevent.monkey.patch_all()

import gevent.queue
import pyttsx3
import sys
import os
import queue
import sounddevice as sd

# import fasttext
from lingua import Language, LanguageDetectorBuilder

detector = LanguageDetectorBuilder.from_all_spoken_languages().build()

from deep_translator import (
    MyMemoryTranslator,
)
from vosk import Model, KaldiRecognizer, SetLogLevel

# global variables
SetLogLevel(-1)
settings = None
app = Flask(__name__)
if len(sys.argv) > 1:
    settingsPath = os.path.normpath(sys.argv[1])
    # default to a non-dev environment if the second argument is omitted
    environment = sys.argv[2] if len(sys.argv) > 2 else ""
q = queue.Queue()


# global functions
def loadSettings():
    """Reload the JSON settings file into the global `settings` dict."""
    with open(settingsPath, 'r') as file:
        global settings
        settings = json.load(file)


# classes
class LanguageDetection:
    def predict_lang(self, text):
        # confidence values come back sorted, so the first five are the top candidates
        language = detector.compute_language_confidence_values(text)
        top_languages = [
            confidence.language.iso_code_639_1.name.lower()
            for confidence in language[:5]
        ]
        return top_languages


class STT:
    samplerate = None
    args = ""
    remaining = ""

    def __init__(self):
        loadSettings()  # settings is a plain dict loaded from JSON, so re-read the file here
        device_info = sd.query_devices(int(settings["STT"]["MICROPHONE"]), "input")
        self.samplerate = int(device_info["default_samplerate"])
        if environment == "dev":
            settings_folder = os.path.dirname(settingsPath)
            src_folder = os.path.dirname(settings_folder)
            main_folder = os.path.dirname(src_folder)
            vosk_model = os.path.join(
                main_folder, "speech_to_text_models", settings["STT"]["LANGUAGE"]
            )
        else:
            resources_folder = os.path.dirname(settingsPath)
            vosk_model = os.path.join(
                resources_folder, "speech_to_text_models", settings["STT"]["LANGUAGE"]
            )
        self.model = Model(rf"{vosk_model}")
        self.dump_fn = None
        self.q = gevent.queue.Queue()
        self.rec = None
        self.is_running = False

    def callback(self, indata, frames, time, status):
        # called by sounddevice from the audio thread; push raw bytes onto the queue
        if status:
            print(status, file=sys.stderr)
        self.q.put(bytes(indata))

    def start_recognition(self):
        self.is_running = True
        with sd.RawInputStream(
            samplerate=self.samplerate,
            blocksize=8000,
            device=int(settings["STT"]["MICROPHONE"]),  # same device the sample rate was queried from
            dtype="int16",
            channels=1,
            callback=self.callback,
        ):
            self.rec = KaldiRecognizer(self.model, self.samplerate)
            while self.is_running:  # loop until stop_recognition() clears the flag
                data = self.q.get()
                if self.rec.AcceptWaveform(data):
                    result_json = json.loads(self.rec.Result())
                else:
                    result_json = json.loads(self.rec.PartialResult())
                # emit valid JSON in the server-sent-events format
                yield f"data: {json.dumps(result_json)}\n\n"

    def stop_recognition(self):
        self.is_running = False


loadSettings()
if settings["STT"]["USE_STT"] and settings["STT"]["LANGUAGE"] != '':
    speech_recognition_service = STT()


class TTS:
    def __init__(self):
        self.engine = pyttsx3.init()

    def say(self, message, voice, count):
        voices = self.engine.getProperty("voices")
        matching_id = None
        for item in voices:
            if item.name == voice:
                matching_id = item.id
                break
        if matching_id is not None:  # keep the engine's default voice if no match is found
            self.engine.setProperty("voice", matching_id)
        settings_folder = os.path.dirname(settingsPath)
        if environment == "dev":
            src_folder = os.path.dirname(settings_folder)
            bot_folder = os.path.dirname(src_folder)
            saveLocation = os.path.join(
                bot_folder, "sounds", f"Internal_{count}.mp3"
            )
        else:
            saveLocation = os.path.join(
                settings_folder, "sounds", f"Internal_{count}.mp3"
            )
        self.engine.save_to_file(message, saveLocation)
        self.engine.runAndWait()

    def voices(self):
        voices = self.engine.getProperty("voices")
        self.engine.say("")  # engine breaks if you do not say something after getting voices
        self.engine.runAndWait()
        return [voice.name for voice in voices]


loadSettings()
if settings["TTS"]["USE_TTS"]:
    text_to_speech_service = TTS()


# endpoints
@app.route("/stream", methods=["GET"])
def stream_recognition():
    return Response(
        speech_recognition_service.start_recognition(),
        content_type="text/event-stream",
    )


@app.route("/stop", methods=["POST"])
def stop_recording():
    speech_recognition_service.stop_recognition()
    return Response("Speech recognition stopped", status=200)


@app.route("/terminate", methods=["GET"])
def terminate_processes():
    try:
        shutdown_server()
    except RuntimeError:
        # werkzeug's shutdown hook is unavailable under waitress; fall through to a hard exit
        pass
    os._exit(0)


def shutdown_server():
    func = request.environ.get("werkzeug.server.shutdown")
    if func is None:
        raise RuntimeError("Server is not running")
    func()


@app.route("/status", methods=["GET"])
def server_status():
    return jsonify({"status": "server is running"})


@app.route("/detect", methods=["POST"])
def get_language():
    try:
        request_data = request.json
        message = request_data.get("message", "")
        lang = LanguageDetection().predict_lang(message)
        return jsonify({"languages": lang}), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/translate", methods=["POST"])
def get_translation():
    try:
        loadSettings()
        request_data = request.json
        message = request_data.get("message", "")
        detectedLanguage = request_data.get("language", "")
        translated = MyMemoryTranslator(
            source=detectedLanguage, target=settings["LANGUAGE"]["TRANSLATE_TO"]
        ).translate(message)
        return jsonify({"translation": translated}), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/audio", methods=["POST"])
def trigger_backend_event():
    try:
        request_data = request.json
        message = request_data.get("message", "")
        # replace URLs with the phrase "a link" so they are not read out loud
        filteredMessage = re.sub(
            r"https?://(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
            "a link",
            message,
        )
        voice = request_data.get("voice")
        count = request_data.get("count")
        text_to_speech_service.say(filteredMessage, voice, count)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    return jsonify({"message": "Audio triggered"}), 200


@app.route("/voices", methods=["GET"])
def get_voices():
    try:
        voices = text_to_speech_service.voices()
        return jsonify({"voices": voices}), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500


if __name__ == "__main__":
    loadSettings()
    port = int(settings["GENERAL"]["PORT"]) if len(sys.argv) > 1 else 9000
    serve(app, host="0.0.0.0", port=port)
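

# ---------------------------------------------------------------------------
# Example client calls (illustrative sketch only, not part of the service).
# Assumes the server was started on the fallback port 9000 and that the
# `requests` package is installed; the payload values (voice name, language
# code, count) are hypothetical and depend on the local settings file.
#
#   import requests
#
#   BASE = "http://localhost:9000"
#   requests.get(f"{BASE}/status").json()          # {"status": "server is running"}
#   requests.post(f"{BASE}/detect", json={"message": "Hola, como estas?"}).json()
#   requests.post(f"{BASE}/translate", json={"message": "Hola", "language": "es"}).json()
#   requests.get(f"{BASE}/voices").json()
#   requests.post(f"{BASE}/audio",
#                 json={"message": "Hello!", "voice": "Microsoft Zira Desktop", "count": 1}).json()
# ---------------------------------------------------------------------------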