from collections import OrderedDict


class Dictionary(object):
    """Dictionary utility class.

    This class is a lightweight version of the Keras text preprocessing
    module (see
    https://github.com/fchollet/keras/blob/master/keras/preprocessing/text.py),
    designed to work on tokens instead of strings.

    It builds a word -> integer index that can in turn be used to fill an
    Embedding layer with word embeddings. Please note that `0` is a reserved
    index that won't be assigned to any word.

    The original keras.preprocessing.text module is licensed under the MIT
    license.
    """

    def __init__(self, num_words=None):
        """Create an empty dictionary.

        :param num_words: Optional cut-off applied when converting tokens to
            indices: indices greater than or equal to `num_words` are
            dropped. A falsy value (`None` or `0`) disables the cut-off.
        """
        # Token -> occurrence count, in order of first appearance.
        self.word_counts = OrderedDict()
        # Token -> 1-based rank (1 is the most frequent token).
        self.word_index = {}
        # Lazily built inverse of `word_index`; None until first needed.
        self.reverse_word_index = None
        self.num_words = num_words
        # Number of documents seen by `fit_on_texts` so far.
        self.document_count = 0

    def fit_on_texts(self, tokenized_documents):
        """Update the vocabulary from tokenized documents.

        Counts accumulate across calls, and `word_index` is rebuilt from the
        cumulative counts every time.

        :param tokenized_documents: An iterable of documents, each one an
            iterable of hashable tokens.
        """
        for document in tokenized_documents:
            self.document_count += 1
            for token in document:
                self.word_counts[token] = self.word_counts.get(token, 0) + 1

        # The sort is stable, so tokens with equal counts keep the order in
        # which they were first seen.
        ranked = sorted(
            self.word_counts.items(), key=lambda item: item[1], reverse=True
        )
        # Note that index 0 is reserved, never assigned to an existing word.
        self.word_index = {
            token: rank for rank, (token, _) in enumerate(ranked, 1)
        }
        # The ranks may have changed: drop the cached reverse mapping so that
        # `tokens_to_words` rebuilds it instead of using stale data.
        self.reverse_word_index = None

    def texts_to_sequences(self, texts):
        """Transform each tokenized text into a sequence of integers.

        :param texts: An iterable of token lists.
        :return: A list with one list of indices per text, as produced by
            `token_list_to_sequence`.
        """
        return [self.token_list_to_sequence(text) for text in texts]

    def token_list_to_sequence(self, tokens):
        """Transform a list of tokens into a list of integer indices.

        Tokens missing from `word_index` are skipped. When `num_words` is
        set, tokens whose index is greater than or equal to `num_words` are
        skipped as well, so only the `num_words - 1` most frequent words are
        kept.

        :param tokens: An iterable of tokens.
        :return: The list of indices of the tokens that were kept, in order.
        """
        limit = self.num_words
        sequence = []
        for token in tokens:
            index = self.word_index.get(token)
            if index is None:
                # Unknown token.
                continue
            if limit and index >= limit:
                # Too rare to fit under the `num_words` cut-off.
                continue
            sequence.append(index)
        return sequence

    def tokens_to_words(self, tokens):
        """Return the words associated with the provided indices.

        The reserved index `0` is skipped.

        :param tokens: An iterable of integer indices.
        :return: The list of corresponding words.
        :raises KeyError: If a non-zero index is not in the dictionary.
        """
        if not self.reverse_word_index:
            self.build_reverse_word_index()
        return [
            self.reverse_word_index[token] for token in tokens if token != 0
        ]

    def build_reverse_word_index(self):
        """(Re)build `reverse_word_index`, the index -> word mapping."""
        self.reverse_word_index = {
            index: word for word, index in self.word_index.items()
        }