import * as SpeechSDK from "microsoft-cognitiveservices-speech-sdk";
import { getToken } from "./utils";
import Synthesizer from "../Synthesizer";
import { Track } from "../track/Track";
import RemoteAudioTrack from "../track/RemoteAudioTrack";

/**
 * Text-to-speech synthesizer backed by the Azure Speech SDK.
 *
 * Synthesized audio is not played on the default output device (the SDK
 * synthesizer is created with a `null` audio config). Instead each result is
 * decoded with the Web Audio API and played, one clip at a time, into a
 * MediaStreamAudioDestinationNode whose audio track is published through
 * `setTrack`.
 *
 * NOTE(review): `speechQueue`, `isSpeaking`, `lastSpokeAt`, `noInterrupt`,
 * `emit`, `removeTrack`, `setIsSpeaking` and `setIsSynthesizing` are not
 * defined in this class; they are presumably provided by the `Synthesizer`
 * base class — confirm there.
 */
export default class SpeechSynthesizer extends Synthesizer {
  constructor() {
    super();
    this.synthesizer = null;
    this.speechConfig = null;
    this.lang = "tr-TR";
    this.voice = null;
    // Buffer source node of the clip that is currently playing, if any.
    this.audioSource = null;
    this.audioContext = null;
    this.destination = null;
    // Queue item whose audio is currently being decoded, if any. Decoding is
    // asynchronous, so this marks the playback slot as taken before
    // `audioSource` exists and lets an interrupt invalidate the pending clip.
    this.decodingSpeech = null;
  }

  /**
   * Fetches an authorization token, creates the audio graph and the SDK
   * synthesizer, publishes the output track and emits "connected".
   * Does nothing when a synthesizer already exists.
   *
   * @returns {Promise<void>}
   */
  async start() {
    if (this.synthesizer) {
      return;
    }

    const { token, region } = await getToken();

    // Another start() may have completed while the token was being fetched;
    // bail out so a second AudioContext and track are not created.
    if (this.synthesizer) {
      return;
    }

    this.audioContext = new (window.AudioContext || window.webkitAudioContext)({
      sampleRate: 16000,
    });
    this.destination = this.audioContext.createMediaStreamDestination();
    const track = this.destination.stream.getAudioTracks()[0];
    this.setTrack(track);

    this.speechConfig = SpeechSDK.SpeechConfig.fromAuthorizationToken(
      token,
      region
    );

    // `null` audio config: audio is delivered in the result object only.
    this.synthesizer = new SpeechSDK.SpeechSynthesizer(this.speechConfig, null);
    this.synthesizer.synthesizing = () => {
      this.setIsSynthesizing(true);
    };
    this.synthesizer.synthesisCompleted = () => {
      this.setIsSynthesizing(false);
    };
    this.synthesizer.SynthesisCanceled = () => {
      this.setIsSynthesizing(false);
    };

    this.emit("connected");
  }

  /**
   * Emits "disconnected", releases the SDK synthesizer and the audio graph,
   * removes the published audio track and resets queue/voice state.
   *
   * @returns {Promise<void>} Resolves once the audio context has finished
   *   closing (a failure to close is logged, not thrown).
   */
  async stop() {
    this.emit("disconnected");

    if (this.synthesizer) {
      this.synthesizer.close();
      this.synthesizer = null;
    }
    // Drop the config so the (now stale) authorization token is not retained.
    this.speechConfig = null;

    // Invalidate any clip that is still being decoded so it never plays.
    this.decodingSpeech = null;

    if (this.audioSource) {
      this.audioSource.stop();
      this.audioSource.disconnect();
      this.audioSource = null;
    }

    let closing = null;
    if (this.audioContext) {
      closing = this.audioContext.close();
      this.audioContext = null;
    }

    this.destination = null;

    this.removeTrack(Track.Kind.Audio);

    this.speechQueue = [];
    this.lastSpokeAt = null;
    this.isSpeaking = false;
    this.lang = "tr-TR";
    this.voice = null;

    try {
      await closing;
    } catch (error) {
      console.error("Failed to close audio context: " + error);
    }
  }

  /**
   * Requests synthesis of `speechData.text` and, once audio is available,
   * enqueues it for playback.
   *
   * @param {object} speechData
   * @param {string} speechData.text - Plain text to speak; XML-encoded before
   *   being embedded in SSML.
   * @param {string} [speechData.voice] - Voice name; defaults to `this.voice`.
   *   When falsy no `<voice>` element is emitted.
   * @param {string} [speechData.lang] - Value for `xml:lang`; defaults to
   *   `this.lang`, then "tr-TR".
   */
  speak(speechData) {
    const synthesizer = this.synthesizer;
    if (!synthesizer) {
      console.error("Speech synthesis failed: synthesizer is not started");
      return;
    }

    const {
      text,
      voice = this.voice,
      lang = this.lang || "tr-TR",
    } = speechData;

    const encode = value => SpeechSDK.Synthesizer.XMLEncode(value);

    let ssml = encode(text);
    if (voice) {
      // Attribute values are encoded too so a quote cannot break the markup.
      ssml = `<voice name='${encode(String(voice))}'>${ssml}</voice>`;
    }
    ssml = `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts' xmlns:emo='http://www.w3.org/2009/10/emotionml' xml:lang='${encode(String(lang))}'>${ssml}</speak>`;

    synthesizer.speakSsmlAsync(
      ssml,
      e => {
        if (this.synthesizer !== synthesizer) {
          // stop() ran (or the synthesizer was replaced) while this request
          // was in flight; the audio graph it targeted is gone.
          return;
        }

        if (e.reason === SpeechSDK.ResultReason.SynthesizingAudioCompleted) {
          console.log(
            `[AI-SPEAKING-COMPLETE][${new Date().toISOString()}]`,
            speechData.text
          );
          this.speechQueue.push({ audioData: e.audioData, speechData });
          this.processQueue();
        } else {
          console.error("Speech synthesis failed: " + e.errorDetails);
        }
      },
      error => {
        console.error("Speech synthesis failed: " + error);
      }
    );
  }

  /**
   * Interrupts speech: drops every clip that has not started playing
   * (queued or still being decoded) and stops the clip that is playing.
   * Does nothing when `this.noInterrupt` is truthy.
   *
   * @returns {Promise<Array>} Truthy `speechData.id` values of the clips that
   *   were dropped before playing. The clip that was already playing is not
   *   included.
   */
  async stopSpeaking() {
    if (this.noInterrupt) {
      return [];
    }

    // A clip that is mid-decode has left the queue but has not been heard;
    // clearing the marker makes its decode callback discard the result.
    const interrupted = this.decodingSpeech;
    this.decodingSpeech = null;

    const dropped = interrupted
      ? [interrupted, ...this.speechQueue]
      : this.speechQueue;

    const skippedTextIds = dropped
      .map(s => s?.speechData?.id)
      .filter(id => id);

    const lastSentence = dropped[dropped.length - 1];

    if (lastSentence) {
      console.log(
        `[AI-SPEAKING-STOPING][${new Date().toISOString()}]`,
        lastSentence?.speechData?.text
      );
    }

    this.speechQueue = [];
    if (this.audioSource) {
      this.audioSource.stop();
      this.audioSource.disconnect();
      this.audioSource = null;
    }

    if (interrupted) {
      // processQueue() flagged speaking before decoding began and no
      // `onended` will ever fire for this clip, so reset the flag here.
      this.setIsSpeaking(false);
    }

    return skippedTextIds;
  }

  /**
   * Starts playback of the next queued clip unless one is already playing or
   * being decoded. When the last clip ends, clears the speaking flag and
   * emits "speechCompleted" with that clip's `speechData`.
   */
  processQueue() {
    if (this.speechQueue.length === 0) {
      return;
    }

    // The playback slot is taken either by a playing source or by a decode
    // that has not produced its source yet.
    if (this.audioSource || this.decodingSpeech) {
      return;
    }

    if (!this.audioContext || !this.destination) {
      // Audio graph has been torn down by stop().
      return;
    }

    const item = this.speechQueue.shift();
    const { audioData, speechData } = item;

    this.decodingSpeech = item;
    this.setIsSpeaking(true);

    const onDecoded = buffer => {
      if (this.decodingSpeech !== item) {
        // Interrupted by stopSpeaking()/stop() while decoding.
        return;
      }
      this.decodingSpeech = null;

      const source = this.audioContext.createBufferSource();
      this.audioSource = source;
      source.connect(this.destination);
      source.buffer = buffer;
      source.onended = () => {
        if (this.audioSource === source) {
          this.audioSource = null;
        } else if (this.audioSource || this.decodingSpeech) {
          // This source was stopped earlier and a newer clip already owns
          // the playback slot; leave its state alone.
          return;
        }

        if (this.speechQueue.length > 0) {
          this.processQueue();
        } else {
          this.setIsSpeaking(false);
          this.emit("speechCompleted", speechData);
        }
      };
      source.start();
    };

    const onDecodeError = error => {
      if (this.decodingSpeech !== item) {
        return;
      }
      this.decodingSpeech = null;
      console.error("Speech audio decoding failed: " + error);

      // Skip the broken clip instead of stalling the queue.
      if (this.speechQueue.length > 0) {
        this.processQueue();
      } else {
        this.setIsSpeaking(false);
      }
    };

    let pending;
    try {
      pending = this.audioContext.decodeAudioData(
        audioData,
        onDecoded,
        onDecodeError
      );
    } catch (error) {
      onDecodeError(error);
      return;
    }

    // Implementations that also return a promise reject it on failure; the
    // error callback above has already handled that case, so only prevent an
    // unhandled-rejection report here.
    if (pending && typeof pending.catch === "function") {
      pending.catch(() => {});
    }
  }

  /**
   * Wraps a raw audio track in a RemoteAudioTrack bound to this instance's
   * audio context and destination stream, then hands it to the base class.
   *
   * @param {MediaStreamTrack} track - Audio track taken from
   *   `this.destination.stream`.
   */
  setTrack(track) {
    const mediaTrack = new RemoteAudioTrack(
      track,
      Track.Kind.Audio,
      undefined,
      this.audioContext
    );
    mediaTrack.setMediaStream(this.destination.stream);
    super.setTrack(mediaTrack);
  }
}
