/* Developed by Inventives, Inc. <https://inventives.ai> */
/* See LICENSE.md file in project root directory */
import { Transcription, TranscriptionEvent } from './Transcription';
import { FIFO } from 'util/fifo';

/**
 * Event of type `voice_on`, dispatched when voice is activated.
 * Exposes the `Transcription` it was constructed with as `transcription`.
 */
export class VoiceOnEvent extends Event {
    constructor(readonly transcription: Transcription) {
        super('voice_on');
    }
}

/**
 * Event of type `voice_off`, dispatched when voice is deactivated.
 * Carries no payload beyond the event type.
 */
export class VoiceOffEvent extends Event {
    constructor() { super('voice_off'); }
}

/**
 * Event of type `volume`, dispatched whenever a new volume average is calculated.
 *
 * - `volume`: the value passed as the first constructor argument (the raw average)
 * - `smooth`: the value passed as the second constructor argument (the filtered average)
 */
export class VolumeEvent extends Event {
    constructor(readonly volume: number, readonly smooth: number) {
        super('volume');
    }
}

/**
 * Event of type `error`, dispatched when the processing encounters an error.
 * The value that was caught is exposed untouched as `error`.
 */
export class SpeechProcessorError extends Event {
    constructor(readonly error: any) {
        super('error');
    }
}

/**
 * Optional settings accepted by the SpeechProcessor constructor.
 * Every field may be omitted; the constructor then falls back to the default noted on each field.
 */
export interface SpeechProcessorOptions {
    /** Custom stream object to analyze. When omitted, SpeechProcessor.setupMic can acquire one later. */
    stream?: MediaStream;

    // ---- Voice activity detector tuning parameters ----

    /** Buffer size of the script processor and FFT size of the analyser. Default 1024*8. */
    filterSize?: number;
    /** Weight given to the newest chunk's volume in the low-pass filter. Default 0.15. */
    filterGain?: number;
    /** Upper bound the smoothed volume is clamped to. Default 0.05. */
    filterLimit?: number;
    /** Raw chunk volume above which voice is activated. Default 0.03. */
    trigger?: number;
    /** Smoothed volume below which voice is deactivated. Default 0.01. */
    diedown?: number;
    /** Size given to the FIFO of recent audio chunks that is sent when voice activates. Default 5. */
    bufferSize?: number;

    // ---- Transcription parameters (forwarded as-is to each Transcription) ----

    /** Forwarded as `minDuration`; unit is defined by Transcription. Default 1. */
    minDuration?: number;
    /** Forwarded as `minCharacters`. Default 2. */
    minCharacters?: number;
}

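// Strongly-typed events on top of EventTarget, loosely based on
// https://dev.to/43081j/strongly-typed-event-emitters-using-eventtarget-in-typescript-3658
// Rather than re-declaring the EventTarget interface, the type casts are done inside SpeechProcessor's own listener methods.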
/** Maps every event name SpeechProcessor accepts listeners for to the event class those listeners receive */
interface SpeechProcessorEvents {
    transcript: TranscriptionEvent;
    error: SpeechProcessorError;
    voice_on: VoiceOnEvent;
    voice_off: VoiceOffEvent;
    volume: VolumeEvent;
}

/**
 * Detects voice activity from microphone input and transcribes utterances.
 *
 * Each audio chunk is reduced to a mean absolute amplitude. When that raw value exceeds
 * `trigger`, an utterance starts and a `Transcription` is opened for it; when the low-pass
 * filtered value falls below `diedown`, the utterance ends and the transcription is closed.
 *
 * Events dispatched:
 *  - `volume`     for every processed chunk (raw and smoothed level)
 *  - `voice_on`   once a transcription has been opened for a live utterance
 *  - `voice_off`  when an utterance ends
 *  - `transcript` forwarded from the utterance's Transcription
 *  - `error`      when teardown, closing or opening a transcription fails
 *
 * All public methods are arrow-function properties, so they stay bound to the instance
 * when passed around detached.
 */
export class SpeechProcessor extends EventTarget {
    /** Sample rate (Hz) shared by the default mic constraint, the AudioContext and the transcription request */
    private static readonly SAMPLE_RATE = 48000;

    token: string
    stream?: MediaStream
    /** Transcriptions that have been opened and not yet finished; emptied by stop() */
    transcriptions: Transcription[]
    /** Tears down the audio graph built by the last start(); undefined until start() has run */
    disconnect?: () => void

    // Voice activity detection params
    filterSize: number
    filterGain: number
    filterLimit: number
    trigger: number
    diedown: number
    bufferSize: number

    // Transcription params
    minDuration: number
    minCharacters: number

    /** Audio chunks are ignored while `enabled` is false or `muted` is true */
    enabled: boolean;
    muted: boolean;

    // Active keywords to listen for (passed to each new Transcription)
    keywords: string[]

    // function to check if we should transcribe audio
    private shouldTranscribe: () => Promise<boolean>

    /**
     * @param token passed to every Transcription this processor opens
     * @param opts  optional stream and tuning overrides; omitted fields use the defaults below
     */
    constructor(token: string, opts?: SpeechProcessorOptions) {
        super();
        this.token = token;
        this.stream = opts?.stream;
        this.transcriptions = [];

        // Default VAD parameters
        this.filterSize = opts?.filterSize ?? 1024*8;
        this.filterGain = opts?.filterGain ?? 0.15;
        this.filterLimit = opts?.filterLimit ?? 0.05;
        this.trigger = opts?.trigger ?? 0.03;
        this.diedown = opts?.diedown ?? 0.01;
        this.bufferSize = opts?.bufferSize ?? 5;

        // Transcription parameters
        this.minDuration = opts?.minDuration ?? 1;
        this.minCharacters = opts?.minCharacters ?? 2;

        this.enabled = true;
        this.muted = false;
        this.keywords = [];

        // Default transcription check function
        this.shouldTranscribe = async () => true;
    }

    /**
     * If a stream is not prepared and passed into the constructor, call this to configure the default mic.
     *
     * @param constraints merged over the default `{ audio: { sampleRate: 48000 } }`; note that a
     *                    caller-supplied `audio` entry replaces the default one wholesale
     * @throws whatever `navigator.mediaDevices.getUserMedia` rejects with
     */
    setupMic = async (constraints?: MediaStreamConstraints): Promise<void> => {
        this.stream = await navigator.mediaDevices.getUserMedia({
            audio: {
                sampleRate: SpeechProcessor.SAMPLE_RATE
            },
            ...(constraints ?? {})
        });
    }

    /**
     * Start processing speech. Keywords to search for and boost in transcription are taken
     * from the `keywords` property at the moment each utterance's transcription is opened.
     *
     * Calling start() while already running tears down the previous audio graph first.
     *
     * @throws Error if no stream is set, the stream has no audio track, or `audio/webm` recording is unsupported
     */
    start = () => {
        const stream = this.stream;
        if (!stream)
            throw new Error("No MediaStream provided to SpeechProcessor.");
        if (!stream.getAudioTracks()?.[0])
            throw new Error("MediaStream does not have any audio tracks.");

        if (!MediaRecorder.isTypeSupported('audio/webm'))
            throw new Error("This browser is not supported for speech.");

        // A second start() would otherwise orphan the previous graph and its AudioContext
        this.disconnect?.();

        // Voice activity detector
        const audio = new AudioContext({ sampleRate: SpeechProcessor.SAMPLE_RATE });
        const analyzer = audio.createAnalyser();
        const mic = audio.createMediaStreamSource(stream);
        const proc = audio.createScriptProcessor(this.filterSize, 1, 1);
        analyzer.fftSize = this.filterSize;

        mic.connect(analyzer);
        analyzer.connect(proc);
        proc.connect(audio.destination);

        // False once this session is torn down; lets late async work bail out
        let running = true;

        this.disconnect = () => {
            // Idempotent: closing an already-closed AudioContext would reject
            if (!running) return;
            running = false;

            try {
                proc.onaudioprocess = null;
                analyzer.disconnect();
                mic.disconnect();
                proc.disconnect();
            }
            catch (e) {
                console.error("Failed to disconnect speech processors.");
                console.error(e);
                this.dispatchEvent(new SpeechProcessorError(e));
            }

            // Disconnecting nodes does not release the context itself
            audio.close().catch((e: unknown) => {
                console.error("Failed to close speech audio context.");
                console.error(e);
                this.dispatchEvent(new SpeechProcessorError(e));
            });
        };

        // Low pass filtered/smoothened volume
        let smooth = 0.0;

        // True while an utterance is in progress
        let active = false;

        // Live audio buffer - keeping track of previous chunks in case the detection is a little late
        const memory = new FIFO<AudioBuffer>(this.bufferSize);

        // Transcription receiving the current utterance. A new one is opened for each utterance
        // (in case there was an error from before, but also to clearly separate utterances).
        // Stays undefined while shouldTranscribe() is pending or when it vetoed the utterance,
        // so audio is never sent to a previous utterance's transcription.
        let txc: Transcription | undefined;

        // Incremented on every activation so a slow shouldTranscribe() can tell its utterance is over
        let utterance = 0;

        // Opens the transcription for utterance `id`; runs asynchronously because
        // shouldTranscribe() may take a while to answer.
        const openTranscription = async (id: number): Promise<void> => {
            try {
                // Override-able virtual function to make sure we should be transcribing this event
                if (! await this.shouldTranscribe())
                    return;

                // stop()/disconnect() ran while we were waiting: do not open anything new
                if (!running)
                    return;

                // Create a transcription record
                const created = new Transcription(this.token, {
                    encoding: "linear16",
                    sampleRate: SpeechProcessor.SAMPLE_RATE,
                    vadTurnoff: 1000,
                    // logging: true,
                    minDuration: this.minDuration,
                    minCharacters: this.minCharacters,
                    keywords: this.keywords
                });

                const live = active && id === utterance;
                if (live)
                    this.dispatchEvent(new VoiceOnEvent(created));

                // Forward transcription event
                created.addEventListener('transcript', (ev) => {
                    const t = ev as TranscriptionEvent;
                    this.dispatchEvent(new TranscriptionEvent(t.parent));

                    // Force-close the transcription and drop our record of it so the
                    // list does not grow for the lifetime of the processor
                    t.parent?.close(true);
                    const idx = this.transcriptions.indexOf(created);
                    if (idx !== -1) this.transcriptions.splice(idx, 1);
                });

                this.transcriptions.push(created);
                // Transmit all chunks currently in memory (to get some past data)
                created.send(...memory.all());

                if (live) {
                    // Subsequent chunks of this utterance go straight to it
                    txc = created;
                }
                else {
                    // The utterance ended (voice_off already dispatched) before the transcription
                    // was ready: nothing else will ever close it, so close it now.
                    created.close();
                }
            }
            catch (e) {
                console.error("Failed to start transcription.");
                console.error(e);
                this.dispatchEvent(new SpeechProcessorError(e));
            }
        };

        // Configure audio analyzer for voice activity detection
        proc.onaudioprocess = (e) => {
            if (!this.enabled || this.muted) return;

            const data = e.inputBuffer.getChannelData(0);

            // Mean absolute amplitude of the chunk
            const volume = data.reduce((c, x) => c + Math.abs(x), 0) / data.length;

            // Apply low pass filter to smooth the input
            smooth = Math.min(smooth * (1 - this.filterGain) + volume * this.filterGain, this.filterLimit);

            this.dispatchEvent(new VolumeEvent(volume, smooth));

            // Save data to memory for future transmission
            // Clone input buffer
            const ibuf = new AudioBuffer({
                length: e.inputBuffer.length,
                numberOfChannels: 1,
                sampleRate: e.inputBuffer.sampleRate
            });
            ibuf.copyToChannel(data, 0);
            memory.in(ibuf);

            // If active, send data for transcription
            if (active) {
                txc?.send(ibuf);

                // If the filtered amplitude drops below the die down threshold, deactivate
                if (smooth < this.diedown) {
                    active = false;

                    // Close the transcription and forget it so the next utterance cannot reuse it
                    txc?.close();
                    txc = undefined;

                    this.dispatchEvent(new VoiceOffEvent());
                }
            }

            // If not activated but the raw value crosses the trigger threshold, activate!
            else if (volume > this.trigger) {
                active = true;
                utterance += 1;
                txc = undefined;

                // Set filter to raw (trigger) value to prevent premature cut-off
                smooth = Math.min(volume, this.filterLimit);

                // Fire and forget: failures are reported through the 'error' event
                void openTranscription(utterance);
            }
        };
    }

    /**
     * Stop speech processing and clean up: force-closes every tracked transcription,
     * empties `transcriptions`, then tears down the audio graph. A transcription that fails
     * to close is reported through the `error` event and does not prevent the rest of the cleanup.
     */
    stop = () => {
        // Take the records out first so the list ends up empty whatever happens below
        const open = this.transcriptions.splice(0);
        for (const t of open) {
            try {
                t.close(true);
            }
            catch (e) {
                console.error("Failed to close transcription.");
                console.error(e);
                this.dispatchEvent(new SpeechProcessorError(e));
            }
        }
        this.disconnect?.();
    }

    /** Event listeners: typed wrappers that cast the callback and delegate to EventTarget */
    addEventListener = <K extends keyof SpeechProcessorEvents>(type: K, callback: (event: SpeechProcessorEvents[K]) => void) => {
        super.addEventListener(type, callback as EventListener | EventListenerObject);
    }

    removeEventListener = <K extends keyof SpeechProcessorEvents>(type: K, callback: (event: SpeechProcessorEvents[K]) => void) => {
        super.removeEventListener(type, callback as EventListener | EventListenerObject);
    }

    dispatchEvent = (event: TranscriptionEvent | SpeechProcessorError | VoiceOnEvent | VoiceOffEvent | VolumeEvent) => {
        return super.dispatchEvent(event);
    }

    /**
     * Set transcription check function to be overridden.
     * It is awaited once per detected utterance; resolving to false skips transcription of that utterance.
     */
    setShouldTranscribeFunction = (f: () => Promise<boolean>) => {
        this.shouldTranscribe = f;
    }
}