/**
 * Languages offered for transcription, keyed by short language code and mapped to an
 * English display name.
 *
 * The list was taken from the Whisper code base:
 * https://github.com/openai/whisper/blob/ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab/whisper/tokenizer.py#L10
 *
 * Most codes are two letters (e.g. `en`); a few are three letters (`haw`, `yue`).
 */
export const TRANSCRIPTION_LANGUAGES = {
  en: "English",
  zh: "Chinese (Mandarin)",
  de: "German",
  es: "Spanish",
  ru: "Russian",
  ko: "Korean",
  fr: "French",
  ja: "Japanese",
  pt: "Portuguese",
  tr: "Turkish",
  pl: "Polish",
  ca: "Catalan",
  nl: "Dutch",
  ar: "Arabic",
  sv: "Swedish",
  it: "Italian",
  id: "Indonesian",
  hi: "Hindi",
  fi: "Finnish",
  vi: "Vietnamese",
  he: "Hebrew",
  uk: "Ukrainian",
  el: "Greek",
  ms: "Malay",
  cs: "Czech",
  ro: "Romanian",
  da: "Danish",
  hu: "Hungarian",
  ta: "Tamil",
  no: "Norwegian",
  th: "Thai",
  ur: "Urdu",
  hr: "Croatian",
  bg: "Bulgarian",
  lt: "Lithuanian",
  la: "Latin",
  mi: "Maori",
  ml: "Malayalam",
  cy: "Welsh",
  sk: "Slovak",
  te: "Telugu",
  fa: "Persian",
  lv: "Latvian",
  bn: "Bengali",
  sr: "Serbian",
  az: "Azerbaijani",
  sl: "Slovenian",
  kn: "Kannada",
  et: "Estonian",
  mk: "Macedonian",
  br: "Breton",
  eu: "Basque",
  is: "Icelandic",
  hy: "Armenian",
  ne: "Nepali",
  mn: "Mongolian",
  bs: "Bosnian",
  kk: "Kazakh",
  sq: "Albanian",
  sw: "Swahili",
  gl: "Galician",
  mr: "Marathi",
  pa: "Punjabi",
  si: "Sinhala",
  km: "Khmer",
  sn: "Shona",
  yo: "Yoruba",
  so: "Somali",
  af: "Afrikaans",
  oc: "Occitan",
  ka: "Georgian",
  be: "Belarusian",
  tg: "Tajik",
  sd: "Sindhi",
  gu: "Gujarati",
  am: "Amharic",
  yi: "Yiddish",
  lo: "Lao",
  uz: "Uzbek",
  fo: "Faroese",
  ht: "Haitian Creole",
  ps: "Pashto",
  tk: "Turkmen",
  nn: "Nynorsk",
  mt: "Maltese",
  sa: "Sanskrit",
  lb: "Luxembourgish",
  my: "Myanmar",
  bo: "Tibetan",
  tl: "Tagalog",
  mg: "Malagasy",
  as: "Assamese",
  tt: "Tatar",
  haw: "Hawaiian",
  ln: "Lingala",
  ha: "Hausa",
  ba: "Bashkir",
  jw: "Javanese",
  su: "Sundanese",
  yue: "Cantonese",
};

/**
 * Applies the spoken-command find-and-replace rules to an autogenerated transcript.
 *
 * Rules run in a fixed order because later rules consume the output of earlier ones:
 *  1. Whole-text rewrites: "hashtag x" → "#x", "at sign x" → "#@x", "tilde x" → "#~x",
 *     "a no space b" → "ab", "a underscore b" → "a_b", to-do tag variants → "[] #todo",
 *     "new paragraph" / "new para" → blank line, "new line" → line break (optionally followed
 *     by "tab"s and "dash" / "bullet" to start an indented list item), "colon [space]" → ": ".
 *  2. Per line: a line mentioning "to do" / "todo" that has no checkbox yet gets a "[] " prefix.
 *  3. "new entry" inserts the entry separator "\n--\n"; each "top entry" moves the text that
 *     follows it in front of everything dictated before it.
 *  4. Each entry loses leading whitespace/punctuation and surrounding whitespace, and a
 *     trailing run of "." / "," is collapsed to a single ".".
 *
 * @param transcript Raw transcript text.
 * @returns The transcript with all commands applied; an empty transcript yields "".
 */
export function processTranscriptReplacements(transcript: string): string {
  const entrySeparator = "\n--\n";
  const topEntryMarker = "**topentry**";

  const replaced = transcript
    // Full text replacements.
    // The `u` flag is required: without it `\p{Alpha}` is not a Unicode property escape but
    // the literal characters "p{Alha}".
    .replace(/(hashtag|hash-tag|hash tag|#)[\s,]+([\p{Alpha}_0-9\x2d—+*]*)/giu, "#$2")
    .replace(/(at sign|atsign|@)[\s,]+([\p{Alpha}_0-9\x2d—+*]*)/giu, "#@$2")
    .replace(/(tilde|~)[\s,]+([\p{Alpha}_0-9\x2d—+*]*)/giu, "#~$2")
    .replace(/([\p{Alpha}_0-9\x2d—+*]*)\s+(no space|nospace)\s+([\p{Alpha}_0-9\x2d—+*]*)/giu, "$1$3")
    .replace(/([\p{Alpha}_0-9\x2d—+*]*)\s+(underscore)\s+([\p{Alpha}_0-9\x2d—+*]*)/giu, "$1_$3")
    // Handle "#todo", "#to do"
    .replace(/hashtag2do/gi, "#todo")
    .replace(/#2\s*do/gi, "#todo")
    .replace(/#to\s+do/gi, "#todo")
    .replace(/#to-do/gi, "#todo")
    .replace(/#todo(\s?)/gi, "[] #todo$1")
    // Handle "new paragraph", "new para"
    .replace(/new paragraph/gi, "\n\n")
    .replace(/(new para|newpara)/gi, "\n\n")
    // "New line", "newline", etc
    .replace(/new line/gi, "newline")
    .replace(
      /[,\s]{0,2}newline[,\s.]{0,2}((tab[,\s.]{0,2})*)\s?(dash|bullet)[,\s.]*/gi,
      // Count tabs case-insensitively, matching the outer pattern, so "Tab" also indents.
      (_substring, tabs: string) => "\n" + "\t".repeat((tabs.match(/tab/gi) ?? []).length) + "-",
    )
    .replace(/newline[\s,.]{0,2}/gi, "\n")
    // "colon", optionally followed by the spoken word "space". Only whole words are matched so
    // that "semicolon"/"colony" and the first letters of the following word are left intact.
    .replace(/[,\s]{0,2}\bcolon\b[,\s.]*(?:space\b[,\s.]*)?/gi, ": ")
    .split("\n")
    // Line-by-line replacements
    .map((line) => {
      const mentionsTodo = /\bto\s?do\b/i.test(line);
      const hasCheckbox = /\[(\s|x)?\]/i.test(line);
      return mentionsTodo && !hasCheckbox ? line.replace(/^\s*/, "[] ") : line;
    })
    .join("\n")
    // Handle "new entry"
    .replace(/new entry/gi, entrySeparator)
    // Normalize "top entry" / "top-entry" to a marker for the reordering step below
    .replace(/top[ -]entry/gi, topEntryMarker);

  // Each "top entry" means the text after it goes before everything seen so far, i.e. the
  // chunks end up in reverse order. Joining (instead of appending a separator per chunk and
  // slicing it off afterwards) guarantees no character of the text itself is dropped.
  const reordered = replaced.split(topEntryMarker).reverse().join(entrySeparator);

  // Clean up stray punctuation potentially introduced by "new entry" and "top entry" commands (ENT-2250)
  return reordered
    .split(entrySeparator)
    .map((entry) =>
      entry
        // Clean up start of entry
        .replace(/^[\s.,]+/, "")
        // Trim first so trailing punctuation followed by whitespace is still normalized
        .trim()
        // Clean up end of entry
        .replace(/[.,]+$/, "."),
    )
    .join(entrySeparator);
}
