from collections import OrderedDict


class Dictionary(object):
    """Token-to-index dictionary built from tokenized documents.

    This class is a lightweight version of the Keras text preprocessing module
    (see https://github.com/fchollet/keras/blob/master/keras/preprocessing/text.py),
    designed to work on tokens instead of strings.

    It is used to build a dictionary that can in turn be used to fill an
    Embedding layer with word embeddings.

    Please note that `0` is a reserved index that won't be assigned to any word.

    The original keras.preprocessing.text module is licensed under the MIT license.
    """

    def __init__(self, num_words=None):
        """Create an empty dictionary.

        :param num_words: optional upper bound on the indices emitted when
            converting tokens to sequences; only words whose index is strictly
            lower than `num_words` are kept. `None` (or `0`) disables the limit.
        """
        # token -> number of occurrences, in order of first appearance
        self.word_counts = OrderedDict()
        # token -> index (1-based, most frequent token first)
        self.word_index = {}
        # index -> token, built lazily by `tokens_to_words`
        self.reverse_word_index = None
        self.num_words = num_words
        # number of documents seen by `fit_on_texts`
        self.document_count = 0

    def fit_on_texts(self, tokenized_documents):
        """Update the word counts with new documents and rebuild the index.

        Counts accumulate across calls. Indices are assigned by decreasing
        frequency, starting at 1; words with equal counts keep their order of
        first appearance (the sort is stable).

        :param tokenized_documents: an iterable of documents, each document
            being an iterable of hashable tokens.
        """
        counts = self.word_counts
        for document in tokenized_documents:
            self.document_count += 1
            for token in document:
                counts[token] = counts.get(token, 0) + 1

        by_frequency = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        # note that index 0 is reserved, never assigned to an existing word
        self.word_index = {
            word: index for index, (word, _) in enumerate(by_frequency, 1)
        }
        # The cached reverse mapping no longer matches the new indices; drop it
        # so that `tokens_to_words` rebuilds it on next use.
        self.reverse_word_index = None

    def texts_to_sequences(self, texts):
        """Transform each tokenized text into a sequence of integers.

        Each text is converted with `token_list_to_sequence`, so unknown words
        and words filtered out by `num_words` are dropped.

        :param texts: an iterable of token lists.
        :return: a list of index sequences, one per text.
        """
        return [self.token_list_to_sequence(text) for text in texts]

    def token_list_to_sequence(self, tokens):
        """Transform a list of tokens into a list of integer indices.

        Words absent from the dictionary are skipped. When `num_words` is set,
        words whose index is greater than or equal to `num_words` are skipped
        as well.

        :param tokens: an iterable of tokens.
        :return: the list of indices of the retained tokens, in input order.
        """
        limit = self.num_words
        sequence = []
        for token in tokens:
            index = self.word_index.get(token)
            if index is None:
                # unknown word
                continue
            if limit and index >= limit:
                # word too infrequent to be kept
                continue
            sequence.append(index)
        return sequence

    def tokens_to_words(self, tokens):
        """Return the words associated with the provided indices.

        The reserved index `0` is skipped.

        :param tokens: an iterable of integer indices.
        :return: the list of words for every non-zero index, in input order.
        :raises KeyError: if a non-zero index is not present in the dictionary.
        """
        if not self.reverse_word_index:
            self.build_reverse_word_index()

        return [self.reverse_word_index[token] for token in tokens if token != 0]

    def build_reverse_word_index(self):
        """Rebuild `reverse_word_index` (index -> word) from `word_index`."""
        self.reverse_word_index = {
            index: word for word, index in self.word_index.items()
        }