# EnergyNewsKeyword/nlp/dictionary.py

from collections import OrderedDict


class Dictionary(object):
    """Dictionary utility class. This class is a lightweight version of the Keras text preprocessing
    module (see https://github.com/fchollet/keras/blob/master/keras/preprocessing/text.py), designed
    to work on tokens instead of strings.

    This class is used to build a dictionary that can in turn be used to fill an Embedding layer
    with word embeddings.

    Please note that `0` is a reserved index that won't be assigned to any word.

    The original keras.preprocessing.text module is licensed under the MIT license.
    """

    def __init__(self, num_words=None):
        self.word_counts = OrderedDict()  # word -> raw count, in first-seen order
        self.word_index = {}              # word -> integer index (1-based; 0 is reserved)
        self.reverse_word_index = None    # built lazily by `build_reverse_word_index`
        self.num_words = num_words        # if set, keep only the most frequent words
        self.document_count = 0           # number of documents seen by `fit_on_texts`
    def fit_on_texts(self, tokenized_documents):
        """Builds the word index from a list of tokenized documents (each a list of tokens)."""
        for document in tokenized_documents:
            self.document_count += 1
            for w in document:
                if w in self.word_counts:
                    self.word_counts[w] += 1
                else:
                    self.word_counts[w] = 1
        wcounts = list(self.word_counts.items())
        wcounts.sort(key=lambda x: x[1], reverse=True)
        sorted_voc = [wc[0] for wc in wcounts]
        # note that index 0 is reserved, never assigned to an existing word
        self.word_index = dict(zip(sorted_voc, range(1, len(sorted_voc) + 1)))
    def texts_to_sequences(self, texts):
        """Transforms each text in `texts` into a sequence of integers.

        Only the top `num_words` most frequent words will be taken into account.

        :param texts: A list of tokenized texts (each a list of tokens).
        :return: A list of sequences.
        """
        return [self.token_list_to_sequence(text) for text in texts]
    def token_list_to_sequence(self, tokens):
        """Transforms a single list of tokens into a sequence of integers.

        Only the top `num_words` most frequent words will be taken into account.
        Only words known to the dictionary will be taken into account.

        :param tokens: A list of tokens (strings).
        :return: A list of integer indices.
        """
        vect = []
        for w in tokens:
            i = self.word_index.get(w)
            if i is not None:
                # indices start at 1 and 0 is reserved, so this keeps only the
                # `num_words - 1` most frequent words when `num_words` is set
                if self.num_words and i >= self.num_words:
                    continue
                vect.append(i)
        return vect
    def tokens_to_words(self, tokens):
        """Utility that returns the words associated with the provided indices.

        :param tokens: A list of integer indices; the reserved index `0` is skipped.
        :return: A list of words.
        """
        if not self.reverse_word_index:
            self.build_reverse_word_index()
        words = []
        for token in tokens:
            if token != 0:
                words.append(self.reverse_word_index[token])
        return words
    def build_reverse_word_index(self):
        """Builds the index -> word mapping that `tokens_to_words` reads from."""
        self.reverse_word_index = {index: word for word, index in self.word_index.items()}
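

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module); the documents and
    # tokens below are made-up examples, not data from EnergyNewsKeyword.
    docs = [
        ["solar", "power", "output", "rose", "sharply"],
        ["wind", "power", "output", "fell"],
    ]
    dictionary = Dictionary()
    dictionary.fit_on_texts(docs)
    # "power" and "output" occur twice, so they get the lowest indices (1 and 2);
    # ties among the remaining words are broken by first-seen order.
    sequences = dictionary.texts_to_sequences(docs)
    print(sequences)  # [[3, 1, 2, 4, 5], [6, 1, 2, 7]]
    print(dictionary.tokens_to_words(sequences[0]))  # ['solar', 'power', 'output', 'rose', 'sharply']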