"""Local Flask backend exposing speech-to-text, text-to-speech and language detection.

Command line: ``<script> <settings-path> [environment]``. The settings file is
an INI file read with ``configparser``; this module reads ``STT/MICROPHONE``,
``STT/LANGUAGE`` and ``GENERAL/PORT`` from it. ``environment`` selects how
bundled resource folders are located (``"dev"`` vs. anything else).
"""

from flask import Flask, Response, jsonify, request
import gevent
import gevent.monkey
import json

# Keep this call ahead of the imports below; the original file patches at this
# exact point, so the import order is preserved on purpose.
gevent.monkey.patch_all()
import gevent.queue
import configparser
import pyttsx3
import sys
import os
import queue
import sounddevice as sd
import fasttext
from deep_translator import (
    MyMemoryTranslator,
)
import emoji
from vosk import Model, KaldiRecognizer, SetLogLevel

# global variables
SetLogLevel(-1)

settings = configparser.ConfigParser()
app = Flask(__name__)

if len(sys.argv) > 1:
    settingsPath = os.path.normpath(sys.argv[1])
    # The environment argument is optional; "dev" mirrors the default the
    # script already used when started without arguments.
    environment = sys.argv[2] if len(sys.argv) > 2 else "dev"
else:
    # Without a settings file the services below cannot be constructed, so
    # fail with a readable message instead of a NameError further down.
    raise SystemExit(
        f"usage: {os.path.basename(sys.argv[0])} <settings-path> [environment]"
    )

q = queue.Queue()


# global functions
def _resource_dir(dev_levels_up):
    """Return the folder that bundled resources are resolved against.

    Args:
        dev_levels_up: How many directory levels above ``settingsPath`` the
            folder sits when ``environment == "dev"``.

    Returns:
        In "dev" the ancestor of ``settingsPath`` that many levels up;
        otherwise the directory that contains the settings file.
    """
    levels = dev_levels_up if environment == "dev" else 1
    folder = settingsPath
    for _ in range(levels):
        folder = os.path.dirname(folder)
    return folder


# classes
class LanguageDetection:
    """Wrapper around the fastText ``lid.176.bin`` language-identification model."""

    def __init__(self):
        """Load the model from the ``language_detection_model`` resource folder."""
        language_detection_model = os.path.join(
            _resource_dir(3), "language_detection_model", "lid.176.bin"
        )
        self.model = fasttext.load_model(language_detection_model)

    def predict_lang(self, text, k=5):
        """Return the ``k`` most likely language codes for ``text``.

        Args:
            text: Text to classify.
            k: Number of candidate labels requested from the model.

        Returns:
            A list of labels with the ``__label__`` prefix removed, in the
            order the model returned them.
        """
        # fastText's predict() rejects input containing newlines, so fold
        # them into spaces before classifying.
        single_line = text.replace("\n", " ")
        predictions = self.model.predict(single_line, k=k)
        # predictions[0] holds the labels; the scores are not used here.
        return [label.replace("__label__", "") for label in predictions[0]]


class STT:
    """Streaming speech recognition using Vosk fed by a sounddevice input stream."""

    samplerate = None
    args = ""
    remaining = ""

    def __init__(self):
        """Read the settings file, query the microphone and load the Vosk model."""
        settings.read(settingsPath)
        # Remember the configured device so the stream is opened on the same
        # device the sample rate was queried from.
        self.device = int(settings["STT"]["MICROPHONE"])
        device_info = sd.query_devices(self.device, "input")
        self.samplerate = int(device_info["default_samplerate"])

        vosk_model = os.path.join(
            _resource_dir(3), "speech_to_text_models", settings["STT"]["LANGUAGE"]
        )
        self.model = Model(vosk_model)
        self.dump_fn = None

        self.q = gevent.queue.Queue()
        self.rec = None
        self.is_running = False

    def callback(self, indata, frames, time, status):
        """Audio callback handed to ``sd.RawInputStream``.

        Prints any reported ``status`` to stderr and queues the captured
        block as ``bytes`` for the recognition loop.
        """
        if status:
            print(status, file=sys.stderr)
        self.q.put(bytes(indata))

    def start_recognition(self):
        """Yield server-sent-event chunks with recognition results.

        Opens the input stream on the configured microphone and keeps feeding
        queued audio to a ``KaldiRecognizer`` until ``stop_recognition`` is
        called. Leaving the loop closes the stream.

        Yields:
            ``"data: <result>\\n\\n"`` strings, one per processed audio block
            (a final result when the recognizer accepts the waveform, a
            partial result otherwise).
        """
        self.is_running = True

        with sd.RawInputStream(
            samplerate=self.samplerate,
            blocksize=8000,
            device=self.device,
            dtype="int16",
            channels=1,
            callback=self.callback,
        ):
            self.rec = KaldiRecognizer(self.model, self.samplerate)
            # Re-check the flag for every block so /stop actually ends the
            # stream; the next captured block wakes the blocking get().
            while self.is_running:
                data = self.q.get()
                if self.rec.AcceptWaveform(data):
                    raw_result = self.rec.Result()
                else:
                    raw_result = self.rec.PartialResult()
                result_json = json.loads(str(raw_result))
                # NOTE(review): this emits the Python dict repr (single
                # quotes), not JSON. Left unchanged because the client is not
                # visible here — confirm before switching to json.dumps().
                yield f"data: {result_json}\n\n"

    def stop_recognition(self):
        """Ask the running recognition loop to finish."""
        self.is_running = False


speech_recognition_service = STT()


class TTS:
    """Text-to-speech rendering to audio files through pyttsx3."""

    engine = None
    rate = None

    def __init__(self):
        """Initialise the pyttsx3 engine."""
        self.engine = pyttsx3.init()

    def say(self, message, voice, count):
        """Render ``message`` to ``sounds/tts/Internal_<count>.mp3``.

        Args:
            message: Text to synthesise.
            voice: Name of the voice to use, as returned by ``voices()``.
                If no installed voice has this name, the engine's current
                voice is kept and a warning is printed to stderr.
            count: Value interpolated into the output file name.
        """
        installed = self.engine.getProperty("voices")
        matching_id = next(
            (item.id for item in installed if item.name == voice), None
        )
        if matching_id is None:
            # Previously this path crashed with UnboundLocalError.
            print(
                f"Voice {voice!r} not found; keeping the current voice",
                file=sys.stderr,
            )
        else:
            self.engine.setProperty("voice", matching_id)

        # Separate path components instead of a hard-coded "\\" separator.
        saveLocation = os.path.join(
            _resource_dir(2), "sounds", "tts", f"Internal_{count}.mp3"
        )

        self.engine.save_to_file(message, saveLocation)
        self.engine.runAndWait()

    def voices(self):
        """Return the names of the voices reported by the engine."""
        voices = self.engine.getProperty("voices")
        self.engine.say(
            ""
        )  # engine breaks if you do not say something after getting voices
        self.engine.runAndWait()
        return [voice.name for voice in voices]


text_to_speech_service = TTS()


# endpoints
@app.route("/stream", methods=["GET"])
def stream_recognition():
    """Stream speech-recognition results as ``text/event-stream``."""
    return Response(
        speech_recognition_service.start_recognition(),
        content_type="text/event-stream",
    )


@app.route("/stop", methods=["POST"])
def stop_recording():
    """Signal the recognition loop to stop."""
    speech_recognition_service.stop_recognition()
    return Response("Speech recognition stopped", status=200)


# @app.before_request
# def custom_warning():
#     if environment == "dev":
#         print(
#             # "Running in internal development environment. This server is not for production use."
#         )


@app.route("/terminate", methods=["GET"])
def terminate_processes():
    """Shut the server down and end the process immediately."""
    try:
        shutdown_server()
    except RuntimeError:
        # No graceful shutdown hook in this environment; the hard exit below
        # still ends the process, which is what the caller asked for.
        app.logger.warning("No server shutdown hook available; exiting directly")
    os._exit(0)


def shutdown_server():
    """Invoke the shutdown callable exposed in the WSGI environ.

    Raises:
        RuntimeError: If the environ carries no shutdown callable.
    """
    # The original looked up the misspelled key "sever shutdown", which can
    # never be present; this is the key the Werkzeug dev server used.
    func = request.environ.get("werkzeug.server.shutdown")
    if func is None:
        raise RuntimeError("Server is not running")
    func()


# @app.route("/detect", methods=["POST"])
# def server_status():
#     try:
#         request_data = request.json
#         message = request_data.get("message", "")
#         confidence_values = detector.compute_language_confidence_values(message)
#         for language, value in confidence_values:
#             print(f"{language.name}: {value:.2f}")
#         message = request_data.get("message", "")
#     except Exception as e:
#         return jsonify({"error": "An error occurred"}), 500
#     return jsonify({"message": "Audio triggered"}), 200


@app.route("/status", methods=["GET"])
def server_status():
    """Report that the server is up."""
    return jsonify({"status": "server is running"})


@app.route("/audio", methods=["POST"])
def trigger_backend_event():
    """Synthesise the posted ``message`` with the requested ``voice``.

    Reads ``message``, ``voice`` and ``count`` from the JSON body. Any failure
    is logged and reported as a generic 500 response.
    """
    try:
        request_data = request.json
        message = request_data.get("message", "")
        voice = request_data.get("voice")
        count = request_data.get("count")
        text_to_speech_service.say(message, voice, count)
    except Exception:
        # Endpoint boundary: keep the generic reply but record the cause.
        app.logger.exception("Text-to-speech request failed")
        return jsonify({"error": "An error occurred"}), 500
    return jsonify({"message": "Audio triggered"}), 200


@app.route("/voices", methods=["GET"])
def get_voices():
    """Return the available text-to-speech voice names."""
    try:
        voices = text_to_speech_service.voices()
        return jsonify({"voices": voices}), 200
    except Exception:
        # Endpoint boundary: keep the generic reply but record the cause.
        app.logger.exception("Listing voices failed")
        return jsonify({"error": "An error occurred"}), 500


def _run_startup_self_test():
    """Print a sample language detection and translation, best effort.

    The translation goes through ``MyMemoryTranslator``; any failure is
    reported on stderr so it cannot keep the server from starting.
    """
    try:
        detector = LanguageDetection()
        print(detector.predict_lang("hola cómo estás"))
        translated = MyMemoryTranslator(
            source="english", target="spanish latin america"
        ).translate("Keep it up. You are awesome")
        print(translated)
    except Exception as error:
        print(f"Startup self-test failed: {error}", file=sys.stderr)


if __name__ == "__main__":
    _run_startup_self_test()

    settings.read(settingsPath)
    # Fall back to the script's historical default port when none is set.
    port = settings.getint("GENERAL", "PORT", fallback=9000)

    app.run(host="127.0.0.1", port=port)