import { regexpMatchers } from "../editor/utils/mark/regexpMatchers";

/**
 * A single unit produced by {@link tokenize}.
 */
export type Token = {
  /** Kind of token: a plain word, a quoted phrase, or a word matching the hashtag matcher. */
  type: "text" | "quote" | "hashtag";
  /** Token text; for "quote" tokens the surrounding quote characters are removed. */
  text: string;
  /** Output of {@link stemmer} for "text" tokens; `null` for "quote" and "hashtag" tokens. */
  stem: string | null;
  /** Whether a "text" token is a stopword; always `false` for "quote" and "hashtag" tokens. */
  stopword: boolean;
};

/** Matches a single whitespace character; used to detect token boundaries. */
const WHITESPACE = /\s/;
/** Matches a straight or curly double-quote character; used to detect quoted phrases. */
const QUOTES = /["“”]/;

/**
 * Split a string into tokens.
 *
 * The input is trimmed and then split on whitespace. A span enclosed by two
 * quote characters (`"`, `“` or `”`) becomes a single "quote" token whose text
 * excludes the quotes. A quote character with no closing partner is dropped
 * and only acts as a separator. Words matching `regexpMatchers.hashtag` become
 * "hashtag" tokens; every other word becomes a "text" token carrying its stem
 * and stopword flag.
 *
 * Examples:
 * ```
 * tokenize("foo bar");
 * // => [
 * //   { type: "text", text: "foo", stem: "fo", stopword: false },
 * //   { type: "text", text: "bar", stem: "bar", stopword: false },
 * // ]
 * tokenize('"foo bar" baz');
 * // => [
 * //   { type: "quote", text: "foo bar", stem: null, stopword: false },
 * //   { type: "text", text: "baz", stem: "baz", stopword: false },
 * // ]
 * ```
 *
 * @param text - Raw string to tokenize.
 * @returns Tokens in the order they appear in `text`; empty for blank input.
 */
export function tokenize(text: string): Token[] {
  const trimmed = text.trim();
  if (trimmed === "") return [];

  const tokens: Token[] = [];

  // Emit an unquoted word as a hashtag or text token. Empty spans (e.g. the
  // gap between two consecutive separators) are ignored.
  function addWord(word: string): void {
    if (word === "") return;
    if (word.match(regexpMatchers.hashtag)) {
      tokens.push({
        type: "hashtag",
        text: word,
        stem: null,
        stopword: false,
      });
    } else {
      tokens.push({
        type: "text",
        text: word,
        stem: stemmer(word),
        stopword: isStopword(word),
      });
    }
  }

  let wordStart = 0;
  let pos = 0;
  while (pos < trimmed.length) {
    const char = trimmed[pos];
    if (WHITESPACE.test(char)) {
      addWord(trimmed.slice(wordStart, pos));
      wordStart = pos + 1;
      pos = wordStart;
    } else if (QUOTES.test(char)) {
      // A quote always terminates the word collected so far.
      addWord(trimmed.slice(wordStart, pos));

      // Look for the matching closing quote.
      let closeIndex = -1;
      for (let i = pos + 1; i < trimmed.length; i++) {
        if (QUOTES.test(trimmed[i])) {
          closeIndex = i;
          break;
        }
      }

      if (closeIndex !== -1) {
        tokens.push({
          type: "quote",
          text: trimmed.slice(pos + 1, closeIndex),
          stem: null,
          stopword: false,
        });
        // Resume directly after the closing quote. Skipping one extra
        // character here would drop the first character of whatever follows
        // the quote when it is not whitespace (e.g. `"foo"bar`).
        wordStart = closeIndex + 1;
      } else {
        // No closing quote: skip the stray quote character.
        wordStart = pos + 1;
      }
      pos = wordStart;
    } else {
      pos++;
    }
  }
  addWord(trimmed.slice(wordStart, pos));
  return tokens;
}

/**
 * Suffix patterns stripped by {@link stemmer}, applied once each in this
 * order. None of them is global or sticky, so sharing them across calls is
 * safe.
 */
const INFLECTION_ENDINGS: readonly RegExp[] = [
  /n't$/i,
  /['"](ll|s|ve|re|d)$/i,
  /s$/i,
  /ly$/i,
  /y$/i,
  /ie$/i,
  /e$/i,
  /er$/i,
  /ed$/i,
  /ing$/i,
  /ation$/i,
  /tion$/i,
];

/**
 * Reduce a word to a crude stem by stripping common English endings.
 *
 * Each suffix pattern is tried once, in order; stripping stops as soon as the
 * remaining stem is three characters or shorter. Afterwards, a run of the same
 * lowercase letter at the end of the stem is collapsed to a single letter
 * (e.g. "running" -> "runn" -> "run").
 *
 * @param word - Word to stem. An empty string is returned unchanged.
 * @returns The stemmed word.
 */
export function stemmer(word: string): string {
  let stem = word;
  // remove inflection endings
  for (const ending of INFLECTION_ENDINGS) {
    if (stem.length <= 3) {
      break;
    }
    stem = stem.replace(ending, "");
  }
  // remove duplicate letters at end of word
  // charAt yields "" for an empty stem, so the check below is simply skipped
  // instead of dereferencing undefined.
  const lastChar = stem.charAt(stem.length - 1);
  if (/[a-z]/.test(lastChar)) {
    let end = stem.length - 1;
    while (end > 0 && stem[end - 1] === lastChar) {
      end--;
    }
    stem = stem.slice(0, end + 1);
  }
  return stem;
}

/**
 * Stopwords to filter out of search queries.
 *
 * Based on the NLTK stopwords list, with some modifications, such as adding
 * common contractions which our {@link stemmer} function doesn't handle well
 * (e.g. it stems "won't" into "wo").
 *
 * The words are listed whitespace-separated below and split into a set when
 * the module loads.
 *
 * @see https://gist.github.com/sebleier/554280
 * @see https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
 */
export const stopwords = new Set(
  `
  i me my myself we our ours ourselves
  you your yours yourself yourselves
  he him his himself she her hers herself
  it its itself they them their theirs themselves
  what which who whom this that these those
  am is are was were be been being
  have has had having do does did doing
  a an the and but if or because as until while
  of at by for with about against between into through during
  before after above below to from up down in out on off over under
  again further then once here there when where why how
  all any both each few more most other some such
  no nor not only own same so than too very
  s t can can't will won't just don should now
  `
    .trim()
    .split(/\s+/),
);

/**
 * Report whether `word` is in the {@link stopwords} set.
 *
 * The set holds lowercase entries only, so the word is lowercased before the
 * lookup; this way capitalised input such as "The" is recognised as well.
 *
 * @param word - Word to check.
 * @returns `true` if the lowercased word is a stopword.
 */
function isStopword(word: string): boolean {
  return stopwords.has(word.toLowerCase());
}
