from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from utils import glove
from nlp import dictionary as dict
import logging
import numpy as np
import random


def prepare_answer(train_doc, train_answer, train_candidates,
                   test_doc, test_answer, test_candidates,
                   val_doc=None, val_answer=None, val_candidates=None,
                   max_document_length=1000,
                   max_answer_length=20,
                   max_vocabulary_size=50000,
                   embeddings_size=50):
    """
    Prepares a dataset for use by a question-answer like model. This version uses the candidate keyphrases
    generated previously for the training, test and validation sets as candidates for all three sets.

    :param train_doc: the training documents
    :param train_answer: the KPs for the training documents
    :param train_candidates: the candidate KPs for the training documents
    :param test_doc: the test documents
    :param test_answer: the KPs for the test documents
    :param test_candidates: the candidate KPs for the test documents
    :param val_doc: the validation documents (can be None)
    :param val_answer: the KPs for the validation documents (can be None)
    :param val_candidates: the candidate KPs for the validation documents (can be None)
    :param max_document_length: the maximum length of the documents (longer documents will be truncated!)
    :param max_answer_length: the maximum length of the answers (longer answers will be truncated!)
    :param max_vocabulary_size: the maximum size of the vocabulary to use
    (i.e. we keep only the top max_vocabulary_size words)
    :param embeddings_size: the size of the GloVe embeddings to use
    :return: a tuple (train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix, dictionary) containing
    the training, test and validation sets, an embedding matrix for an Embedding layer, and the fitted dictionary
    """

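    # Illustrative sketch of the expected input structure (hedged: the exact ids and tokenization come from
    # the upstream loaders, this is only an example of the shapes this function assumes):
    #   train_doc        = {"doc_1": ["machine", "learning", "for", "keyphrase", "extraction"], ...}
    #   train_answer     = {"doc_1": [["keyphrase", "extraction"]], ...}
    #   train_candidates = {"doc_1": [["keyphrase", "extraction"], ["machine", "learning"]], ...}
    # i.e. dictionaries keyed by document id, whose values are token lists (documents) or lists of
    # tokenized keyphrases (answers and candidates).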
    # Prepare validation return data
    val_q = None
    val_a = None
    val_y = None

    # Prepare the return values: lists that will hold questions (documents), answers (keyphrases), and truth values
    train_q = []
    test_q = []
    train_a = []
    test_a = []
    train_y = []
    test_y = []

    if val_doc and val_answer:
        val_q = []
        val_a = []
        val_y = []

    documents_full = []
    for key, doc in train_doc.items():
        documents_full.append(token for token in doc)
    for key, doc in test_doc.items():
        documents_full.append(token for token in doc)

    if val_doc and val_answer:
        for key, doc in val_doc.items():
            documents_full.append(token for token in doc)

    logging.debug("Fitting dictionary on %s documents..." % len(documents_full))

    dictionary = dict.Dictionary(num_words=max_vocabulary_size)
    dictionary.fit_on_texts(documents_full)

    logging.debug("Dictionary fitting completed. Found %s unique tokens" % len(dictionary.word_index))

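    # Truth labels are one-hot pairs: [0, 1] marks a candidate that is a true keyphrase for the document,
    # [1, 0] marks a wrong candidate.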
    # Pair up each document with a candidate keyphrase and its truth value
    for key, document in train_doc.items():
        doc_sequence = dictionary.token_list_to_sequence(document)
        for kp in train_candidates[key]:
            train_q.append(doc_sequence)
            train_a.append(dictionary.token_list_to_sequence(kp))
            train_y.append([0, 1] if kp in train_answer[key] else [1, 0])

    for key, document in test_doc.items():
        doc_sequence = dictionary.token_list_to_sequence(document)
        for kp in test_candidates[key]:
            test_q.append(doc_sequence)
            test_a.append(dictionary.token_list_to_sequence(kp))
            test_y.append([0, 1] if kp in test_answer[key] else [1, 0])

    if val_doc and val_answer:
        for key, document in val_doc.items():
            doc_sequence = dictionary.token_list_to_sequence(document)
            for kp in val_candidates[key]:
                val_q.append(doc_sequence)
                val_a.append(dictionary.token_list_to_sequence(kp))
                val_y.append([0, 1] if kp in val_answer[key] else [1, 0])

logging.debug("Longest training document : %s tokens" % len(max(train_q, key=len)))
|
|||
|
logging.debug("Longest training answer : %s tokens" % len(max(train_a, key=len)))
|
|||
|
logging.debug("Longest test document : %s tokens" % len(max(test_q, key=len)))
|
|||
|
logging.debug("Longest test answer : %s tokens" % len(max(test_a, key=len)))
|
|||
|
if val_doc and val_answer:
|
|||
|
logging.debug("Longest validation document : %s tokens" % len(max(val_q, key=len)))
|
|||
|
logging.debug("Longest validation answer : %s tokens" % len(max(val_a, key=len)))
|
|||
|
|
|||
|
train_q = np.asarray(pad_sequences(train_q, maxlen=max_document_length, padding='post', truncating='post'))
|
|||
|
train_a = np.asarray(pad_sequences(train_a, maxlen=max_answer_length, padding='post', truncating='post'))
|
|||
|
|
|||
|
test_q = np.asarray(pad_sequences(test_q, maxlen=max_document_length, padding='post', truncating='post'))
|
|||
|
test_a = np.asarray(pad_sequences(test_a, maxlen=max_answer_length, padding='post', truncating='post'))
|
|||
|
|
|||
|
if val_doc and val_answer:
|
|||
|
val_q = np.asarray(pad_sequences(val_q, maxlen=max_document_length, padding='post', truncating='post'))
|
|||
|
val_a = np.asarray(pad_sequences(val_a, maxlen=max_answer_length, padding='post', truncating='post'))
|
|||
|
|
|||
|
logging.debug("Training set documents size : %s", np.shape(train_q))
|
|||
|
logging.debug("Training set answers size : %s", np.shape(train_a))
|
|||
|
logging.debug("Test set documents size : %s", np.shape(test_q))
|
|||
|
logging.debug("Test set answers size : %s ", np.shape(test_a))
|
|||
|
|
|||
|
if val_doc and val_answer:
|
|||
|
logging.debug("Validation set documents size : %s", np.shape(val_q))
|
|||
|
logging.debug("Validation set answers size : %s ", np.shape(val_a))
|
|||
|
|
|||
|
    # prepare the matrix for the embedding layer
    word_index = dictionary.word_index
    embeddings_index = glove.load_glove('', embeddings_size)

    num_words = min(max_vocabulary_size, 1 + len(word_index))

    logging.debug("Building embedding matrix of size [%s,%s]..." % (num_words, embeddings_size))

    embedding_matrix = np.zeros((num_words, embeddings_size))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    return [train_q, train_a], train_y, [test_q, test_a], test_y, [val_q, val_a], val_y, embedding_matrix, dictionary


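# Illustrative usage sketch for prepare_answer (hedged: the layer sizes and wiring below are only one way
# the returned values could feed a Keras model, not the architecture used elsewhere in this project):
#
#   train_x, train_y, test_x, test_y, val_x, val_y, emb_matrix, vocab = prepare_answer(...)
#
#   from keras.layers import Input, Embedding, LSTM, Dense, concatenate
#   from keras.models import Model
#
#   doc_input = Input(shape=(1000,))    # max_document_length
#   ans_input = Input(shape=(20,))      # max_answer_length
#   embedding = Embedding(emb_matrix.shape[0], emb_matrix.shape[1],
#                         weights=[emb_matrix], trainable=False)
#   doc_vector = LSTM(64)(embedding(doc_input))
#   ans_vector = LSTM(64)(embedding(ans_input))
#   prediction = Dense(2, activation='softmax')(concatenate([doc_vector, ans_vector]))
#
#   model = Model(inputs=[doc_input, ans_input], outputs=prediction)
#   model.compile(optimizer='adam', loss='categorical_crossentropy')
#   model.fit(train_x, np.asarray(train_y), validation_data=(val_x, np.asarray(val_y)))

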
def prepare_answer_2(train_doc, train_answer, train_candidates,
                     test_doc, test_answer, test_candidates,
                     val_doc=None, val_answer=None, val_candidates=None,
                     max_document_length=1000,
                     max_answer_length=20,
                     max_vocabulary_size=50000,
                     embeddings_size=50):
    """
    Prepares a dataset for use by a question-answer like model. This version uses the candidate keyphrases
    generated previously as candidates for the test and validation sets, while for the training set it mixes
    the correct answers with an equal number of wrong candidates in order to obtain balanced training data.

    :param train_doc: the training documents
    :param train_answer: the KPs for the training documents
    :param train_candidates: the candidate KPs for the training documents
    :param test_doc: the test documents
    :param test_answer: the KPs for the test documents
    :param test_candidates: the candidate KPs for the test documents
    :param val_doc: the validation documents (can be None)
    :param val_answer: the KPs for the validation documents (can be None)
    :param val_candidates: the candidate KPs for the validation documents (can be None)
    :param max_document_length: the maximum length of the documents (longer documents will be truncated!)
    :param max_answer_length: the maximum length of the answers (longer answers will be truncated!)
    :param max_vocabulary_size: the maximum size of the vocabulary to use
    (i.e. we keep only the top max_vocabulary_size words)
    :param embeddings_size: the size of the GloVe embeddings to use
    :return: a tuple (train_x, train_y, test_x, test_y, val_x, val_y, val_x_balanced, val_y_balanced,
    embedding_matrix, dictionary) containing the training, test and validation sets, a balanced validation
    set built like the training set, an embedding matrix for an Embedding layer, and the fitted dictionary
    """

    # Prepare validation return data
    val_q = None
    val_a = None
    val_y = None

    val_q_balanced = None
    val_a_balanced = None
    val_y_balanced = None

    # Prepare the return values: lists that will hold questions (documents), answers (keyphrases), and truth values
    train_q = []
    test_q = []
    train_a = []
    test_a = []
    train_y = []
    test_y = []

    if val_doc and val_answer:
        val_q = []
        val_a = []
        val_y = []
        val_q_balanced = []
        val_a_balanced = []
        val_y_balanced = []

    documents_full = []
    for key, doc in train_doc.items():
        documents_full.append(token for token in doc)
    for key, doc in test_doc.items():
        documents_full.append(token for token in doc)

    if val_doc and val_answer:
        for key, doc in val_doc.items():
            documents_full.append(token for token in doc)

    logging.debug("Fitting dictionary on %s documents..." % len(documents_full))

    dictionary = dict.Dictionary(num_words=max_vocabulary_size)
    dictionary.fit_on_texts(documents_full)

    logging.debug("Dictionary fitting completed. Found %s unique tokens" % len(dictionary.word_index))

    # Pair up each document with a candidate keyphrase and its truth value
    for key, document in train_doc.items():
        doc_sequence = dictionary.token_list_to_sequence(document)

        # select wrong candidates (possibly, in same quantity as good answers)
        wrong_candidates = list(train_candidates[key])
        for answer in train_answer[key]:
            if answer in wrong_candidates:
                wrong_candidates.remove(answer)

        while len(wrong_candidates) > len(train_answer[key]):
            random_candidate = random.choice(wrong_candidates)
            wrong_candidates.remove(random_candidate)

        # append wrong candidates
        for kp in wrong_candidates:
            train_q.append(doc_sequence)
            train_a.append(dictionary.token_list_to_sequence(kp))
            train_y.append([1, 0])

        # append true answers
        for kp in train_answer[key]:
            train_q.append(doc_sequence)
            train_a.append(dictionary.token_list_to_sequence(kp))
            train_y.append([0, 1])

    if val_doc and val_answer:
        for key, document in val_doc.items():
            doc_sequence = dictionary.token_list_to_sequence(document)

            # select wrong candidates (possibly, in same quantity as good answers)
            wrong_candidates = list(val_candidates[key])
            for answer in val_answer[key]:
                if answer in wrong_candidates:
                    wrong_candidates.remove(answer)

            while len(wrong_candidates) > len(val_answer[key]):
                random_candidate = random.choice(wrong_candidates)
                wrong_candidates.remove(random_candidate)

            # append wrong candidates
            for kp in wrong_candidates:
                val_q_balanced.append(doc_sequence)
                val_a_balanced.append(dictionary.token_list_to_sequence(kp))
                val_y_balanced.append([1, 0])

            # append true answers
            for kp in val_answer[key]:
                val_q_balanced.append(doc_sequence)
                val_a_balanced.append(dictionary.token_list_to_sequence(kp))
                val_y_balanced.append([0, 1])

    # for the other sets, just pick the auto-generated candidates
    for key, document in test_doc.items():
        doc_sequence = dictionary.token_list_to_sequence(document)
        for kp in test_candidates[key]:
            test_q.append(doc_sequence)
            test_a.append(dictionary.token_list_to_sequence(kp))
            test_y.append([0, 1] if kp in test_answer[key] else [1, 0])

    if val_doc and val_answer:
        for key, document in val_doc.items():
            doc_sequence = dictionary.token_list_to_sequence(document)
            for kp in val_candidates[key]:
                val_q.append(doc_sequence)
                val_a.append(dictionary.token_list_to_sequence(kp))
                val_y.append([0, 1] if kp in val_answer[key] else [1, 0])

logging.debug("Longest training document : %s tokens" % len(max(train_q, key=len)))
|
|||
|
logging.debug("Longest training answer : %s tokens" % len(max(train_a, key=len)))
|
|||
|
logging.debug("Longest test document : %s tokens" % len(max(test_q, key=len)))
|
|||
|
logging.debug("Longest test answer : %s tokens" % len(max(test_a, key=len)))
|
|||
|
if val_doc and val_answer:
|
|||
|
logging.debug("Longest validation document : %s tokens" % len(max(val_q, key=len)))
|
|||
|
logging.debug("Longest validation answer : %s tokens" % len(max(val_a, key=len)))
|
|||
|
logging.debug("Longest balanced validation document : %s tokens" % len(max(val_q, key=len)))
|
|||
|
logging.debug("Longest balanced validation answer : %s tokens" % len(max(val_a, key=len)))
|
|||
|
|
|||
|
train_q = np.asarray(pad_sequences(train_q, maxlen=max_document_length, padding='post', truncating='post'))
|
|||
|
train_a = np.asarray(pad_sequences(train_a, maxlen=max_answer_length, padding='post', truncating='post'))
|
|||
|
|
|||
|
test_q = np.asarray(pad_sequences(test_q, maxlen=max_document_length, padding='post', truncating='post'))
|
|||
|
test_a = np.asarray(pad_sequences(test_a, maxlen=max_answer_length, padding='post', truncating='post'))
|
|||
|
|
|||
|
if val_doc and val_answer:
|
|||
|
val_q = np.asarray(pad_sequences(val_q, maxlen=max_document_length, padding='post', truncating='post'))
|
|||
|
val_a = np.asarray(pad_sequences(val_a, maxlen=max_answer_length, padding='post', truncating='post'))
|
|||
|
val_q_balanced = np.asarray(pad_sequences(val_q_balanced, maxlen=max_document_length, padding='post', truncating='post'))
|
|||
|
val_a_balanced = np.asarray(pad_sequences(val_a_balanced, maxlen=max_answer_length, padding='post', truncating='post'))
|
|||
|
|
|||
|
logging.debug("Training set documents size : %s", np.shape(train_q))
|
|||
|
logging.debug("Training set answers size : %s", np.shape(train_a))
|
|||
|
logging.debug("Test set documents size : %s", np.shape(test_q))
|
|||
|
logging.debug("Test set answers size : %s ", np.shape(test_a))
|
|||
|
|
|||
|
if val_doc and val_answer:
|
|||
|
logging.debug("Validation set documents size : %s", np.shape(val_q))
|
|||
|
logging.debug("Validation set answers size : %s ", np.shape(val_a))
|
|||
|
logging.debug("Balanced Validation set documents size : %s", np.shape(val_q_balanced))
|
|||
|
logging.debug("Balanced Validation set answers size : %s ", np.shape(val_a_balanced))
|
|||
|
|
|||
|
    # prepare the matrix for the embedding layer
    word_index = dictionary.word_index
    embeddings_index = glove.load_glove('', embeddings_size)

    num_words = min(max_vocabulary_size, 1 + len(word_index))

    logging.debug("Building embedding matrix of size [%s,%s]..." % (num_words, embeddings_size))

    embedding_matrix = np.zeros((num_words, embeddings_size))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    return [train_q, train_a], train_y, [test_q, test_a], test_y, [val_q, val_a], val_y, \
        [val_q_balanced, val_a_balanced], val_y_balanced, embedding_matrix, dictionary


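# Note: prepare_answer_2 returns two validation sets. [val_q, val_a] / val_y keeps every auto-generated
# candidate, so its label distribution mirrors what the model will see at test time, while
# [val_q_balanced, val_a_balanced] / val_y_balanced is built the same way as the training set (the true
# keyphrases plus at most an equal number of wrong candidates), so its loss is more directly comparable
# to the training loss.

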
def prepare_sequential(train_doc, train_answer, test_doc, test_answer, val_doc, val_answer,
                       max_document_length=35000,
                       max_vocabulary_size=5000,
                       embeddings_size=50,
                       stem_test=False):
    """
    Prepares a dataset for use by a sequential, categorical model.

    :param train_doc: the training documents
    :param train_answer: the KPs for the training documents
    :param test_doc: the test documents
    :param test_answer: the KPs for the test documents
    :param val_doc: the validation documents (can be None)
    :param val_answer: the KPs for the validation documents (can be None)
    :param max_document_length: the maximum length of the documents (longer documents will be truncated!)
    :param max_vocabulary_size: the maximum size of the vocabulary to use
    (i.e. we keep only the top max_vocabulary_size words)
    :param embeddings_size: the size of the GloVe embeddings to use
    :param stem_test: set the value to True if the test set answers are stemmed
    :return: a tuple (train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix) containing the training,
    test and validation sets, and an embedding matrix for an Embedding layer
    """

    train_answer_seq = make_sequential(train_doc, train_answer)

    if not stem_test:
        test_answer_seq = make_sequential(test_doc, test_answer)
    else:
        import copy
        stemmed_test_doc = copy.deepcopy(test_doc)
        stemmed_test_doc = stem_dataset(stemmed_test_doc)
        test_answer_seq = make_sequential(stemmed_test_doc, test_answer)

    # Prepare validation return data
    val_x = None
    val_y = None

    if val_doc and val_answer:
        val_answer_seq = make_sequential(val_doc, val_answer)

    # Transform the documents to sequence
    documents_full = []
    train_y = []
    test_y = []

    if val_doc and val_answer:
        val_y = []

    for key, doc in train_doc.items():
        documents_full.append(token for token in doc)
        train_y.append(train_answer_seq[key])
    for key, doc in test_doc.items():
        documents_full.append(token for token in doc)
        test_y.append(test_answer_seq[key])

    if val_doc and val_answer:
        for key, doc in val_doc.items():
            documents_full.append(token for token in doc)
            val_y.append(val_answer_seq[key])

    logging.debug("Fitting dictionary on %s documents..." % len(documents_full))

    dictionary = dict.Dictionary(num_words=max_vocabulary_size)
    dictionary.fit_on_texts(documents_full)

    logging.debug("Dictionary fitting completed. Found %s unique tokens" % len(dictionary.word_index))

    # Now we can prepare the actual input
    train_x = dictionary.texts_to_sequences(train_doc.values())
    test_x = dictionary.texts_to_sequences(test_doc.values())
    if val_doc and val_answer:
        val_x = dictionary.texts_to_sequences(val_doc.values())

    logging.debug("Longest training document : %s tokens" % len(max(train_x, key=len)))
    logging.debug("Longest test document : %s tokens" % len(max(test_x, key=len)))
    if val_doc and val_answer:
        logging.debug("Longest validation document : %s tokens" % len(max(val_x, key=len)))

    train_x = np.asarray(pad_sequences(train_x, maxlen=max_document_length, padding='post', truncating='post'))
    train_y = pad_sequences(train_y, maxlen=max_document_length, padding='post', truncating='post')
    train_y = make_categorical(train_y)

    test_x = np.asarray(pad_sequences(test_x, maxlen=max_document_length, padding='post', truncating='post'))
    test_y = pad_sequences(test_y, maxlen=max_document_length, padding='post', truncating='post')
    test_y = make_categorical(test_y)

    if val_doc and val_answer:
        val_x = np.asarray(pad_sequences(val_x, maxlen=max_document_length, padding='post', truncating='post'))
        val_y = pad_sequences(val_y, maxlen=max_document_length, padding='post', truncating='post')
        val_y = make_categorical(val_y)

    logging.debug("Training set samples size : %s", np.shape(train_x))
    logging.debug("Training set answers size : %s", np.shape(train_y))
    logging.debug("Test set samples size : %s", np.shape(test_x))
    logging.debug("Test set answers size : %s", np.shape(test_y))

    if val_doc and val_answer:
        logging.debug("Validation set samples size : %s", np.shape(val_x))
        logging.debug("Validation set answers size : %s", np.shape(val_y))

    # prepare the matrix for the embedding layer
    word_index = dictionary.word_index
    embeddings_index = glove.load_glove('', embeddings_size)

    num_words = min(max_vocabulary_size, 1 + len(word_index))

    logging.debug("Building embedding matrix of size [%s,%s]..." % (num_words, embeddings_size))

    embedding_matrix = np.zeros((num_words, embeddings_size))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    return train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix


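# Illustrative usage sketch for prepare_sequential (hedged: the layer sizes below are arbitrary and only
# show one way the returned arrays and embedding matrix could be wired into a Keras sequence tagger; the
# real models live elsewhere in this project):
#
#   train_x, train_y, test_x, test_y, val_x, val_y, emb_matrix = prepare_sequential(
#       train_doc, train_answer, test_doc, test_answer, val_doc, val_answer)
#
#   from keras.models import Sequential
#   from keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense
#
#   model = Sequential()
#   model.add(Embedding(emb_matrix.shape[0], emb_matrix.shape[1],
#                       weights=[emb_matrix], mask_zero=True, trainable=False))
#   model.add(Bidirectional(LSTM(128, return_sequences=True)))
#   model.add(TimeDistributed(Dense(3, activation='softmax')))   # 3 classes: non-KP / begin-KP / inside-KP
#   model.compile(optimizer='adam', loss='categorical_crossentropy')
#   model.fit(train_x, train_y, validation_data=(val_x, val_y))

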
def make_sequential(documents, answers):
    """
    Transform an answer-based dataset (i.e. with a list of
    documents and a list of keyphrases) into a sequential, NER-like
    dataset, i.e. one where the answer set for each document is the
    list of the document's tokens marked as non-keyphrase (0),
    beginning of keyphrase (1) or inside-keyphrase (2).

    For example, for the tokens

    "I am a python developer since today."

    if the keyphrases are "python developer" and "today", the answer
    set for these tokens is

    "[0 0 0 1 2 0 1]"

    :param documents: the documents, as a dictionary keyed by document id
    :param answers: the keyphrases, as a dictionary keyed by document id
    :return: the new answer set
    """

    seq_answers = {}

    for key, document in documents.items():
        doc_answers_set = answers[key]
        '''
        Sort the keyphrases by length: we process the shorter KPs first, so that
        if they are contained in a longer KP they will simply be overwritten
        by the longer one.
        '''
        doc_answers_set.sort(key=lambda a: len(a))

        '''
        This list will contain the answer sequence.
        We initialize it as a list of zeros and then fill in
        the 1s and 2s afterwards.
        '''
        doc_answers_seq = [0] * len(document)

        for answer in doc_answers_set:
            # find the positions where the first word of the KP appears
            appearances = [i for i, word in enumerate(document) if word == answer[0]]
            for idx in appearances:
                is_kp = True
                # check whether the KP also matches from its second word onwards
                for i in range(1, len(answer)):

                    if (i + idx) < len(document):
                        is_kp = answer[i] == document[i + idx]
                    else:
                        # reached the end of the document
                        is_kp = False

                    # stop as soon as one token does not match
                    if not is_kp:
                        break

                # if we found an actual KP, mark its tokens in the output list
                if is_kp:
                    doc_answers_seq[idx] = 1
                    for i in range(1, len(answer)):
                        doc_answers_seq[idx + i] = 2

        seq_answers[key] = doc_answers_seq

    return seq_answers


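# Illustrative example for make_sequential, mirroring the docstring (hedged: the document key "d1" is
# made up for the example):
#
#   docs    = {"d1": ["I", "am", "a", "python", "developer", "since", "today"]}
#   answers = {"d1": [["python", "developer"], ["today"]]}
#   make_sequential(docs, answers)   # -> {"d1": [0, 0, 0, 1, 2, 0, 1]}

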
def make_categorical(x):
    """
    Transform a two-dimensional list into a 3-dimensional array. The 2nd
    dimension of the input list becomes a one-hot 2D array, e.g.
    if the input is [[1,2,0],...], the output will be
    [[[0,1,0],[0,0,1],[1,0,0]],...]

    :param x: a 2D-list
    :return: a 3D-numpy array
    """

    # number of categories
    num_categories = max([item for sublist in x for item in sublist]) + 1

    # output in numpy format
    new_x = np.zeros((len(x), len(x[0]), num_categories))

    # use keras for the actual one-hot conversion
    for i, doc in enumerate(x):
        new_x[i] = np_utils.to_categorical(doc, num_classes=num_categories)

    return new_x


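# Illustrative example for make_categorical, mirroring the docstring:
#
#   make_categorical([[1, 2, 0]])
#   # -> array([[[0., 1., 0.],
#   #            [0., 0., 1.],
#   #            [1., 0., 0.]]])

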
def stem_dataset(dataset):
    """
    Stem every token of every document in the dataset (in place) using the Porter stemmer.

    :param dataset: a dictionary of tokenized documents
    :return: the same dictionary, with stemmed tokens
    """

    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()

    for key, tokens in dataset.items():
        stemmed_tokens = [stemmer.stem(token) for token in tokens]
        dataset[key] = stemmed_tokens

    return dataset