first commit
This commit is contained in:
parent 850c1fc3f4
commit 7e701970c7
@@ -0,0 +1,173 @@
import os, sys

os.chdir(sys.path[0])
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from datasets import EnNews

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

import numpy as np
import random as rn

np.random.seed(421)
rn.seed(12345)

import logging

from keras import regularizers
from keras.layers import Bidirectional, Dense, Dropout, Embedding, LSTM, TimeDistributed

from keras.models import Sequential, load_model

from datasets import *
from eval import keras_metrics, metrics
from nlp import tokenizer as tk
from utils import info, preprocessing, postprocessing, plots

# Logging configuration

logging.basicConfig(
    format='%(asctime)s\t%(levelname)s\t%(message)s',
    level=logging.DEBUG)

info.log_versions()

# Global variables

SAVE_MODEL = False
MODEL_PATH = "models/bilstm.h5"
SHOW_PLOTS = False

# Dataset and hyperparameters
Dataset = EnNews

rootpath = "/home/zhangxj/WorkFile/本科毕业设计"

tokenizer = tk.tokenizers.nltk
DATASET_FOLDER = rootpath + "/EnergyNews"
MAX_DOCUMENT_LENGTH = 400
MAX_VOCABULARY_SIZE = 20000
EMBEDDINGS_SIZE = 50
batch_size = 32
epochs = 20
KP_WEIGHT = 10
STEM_MODE = metrics.stemMode.both
STEM_TEST = False


# Load the dataset
logging.info("Loading dataset...")

data = Dataset(DATASET_FOLDER)

train_doc_str, train_answer_str = data.load_train()
test_doc_str, test_answer_str = data.load_test()
val_doc_str, val_answer_str = data.load_validation()

train_doc, train_answer = tk.tokenize_set(train_doc_str, train_answer_str, tokenizer)
test_doc, test_answer = tk.tokenize_set(test_doc_str, test_answer_str, tokenizer)
val_doc, val_answer = tk.tokenize_set(val_doc_str, val_answer_str, tokenizer)

# Sanity check and preprocessing

logging.info("Dataset loaded. Preprocessing data...")

train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix = preprocessing. \
    prepare_sequential(train_doc, train_answer, test_doc, test_answer, val_doc, val_answer,
                       max_document_length=MAX_DOCUMENT_LENGTH,
                       max_vocabulary_size=MAX_VOCABULARY_SIZE,
                       embeddings_size=EMBEDDINGS_SIZE,
                       stem_test=STEM_TEST)

# Weight the training examples: everything that is not a KP gets a lower weight
from sklearn.utils import class_weight

train_y_weights = np.argmax(train_y, axis=2)
train_y_weights = np.reshape(class_weight.compute_sample_weight('balanced', train_y_weights.flatten()),
                             np.shape(train_y_weights))

logging.info("Data preprocessing complete.")
logging.info("Maximum possible recall: %s",
             metrics.recall(test_answer,
                            postprocessing.get_words(test_doc, postprocessing.undo_sequential(test_y)),
                            STEM_MODE))

if not SAVE_MODEL or not os.path.isfile(MODEL_PATH):

    logging.debug("Building the network...")
    model = Sequential()
    print("-------", np.shape(embedding_matrix)[0])
    embedding_layer = Embedding(np.shape(embedding_matrix)[0],
                                EMBEDDINGS_SIZE,
                                weights=[embedding_matrix],
                                input_length=MAX_DOCUMENT_LENGTH,
                                trainable=False)

    model.add(embedding_layer)
    model.add(Bidirectional(LSTM(300, activation='tanh', recurrent_activation='hard_sigmoid', return_sequences=True)))
    model.add(Dropout(0.25))
    model.add(TimeDistributed(Dense(150, activation='relu', kernel_regularizer=regularizers.l2(0.01))))
    model.add(Dropout(0.25))
    model.add(TimeDistributed(Dense(2, activation='softmax')))

    logging.info("Compiling the network...")
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'],
                  sample_weight_mode="temporal")
    print(model.summary())

    metrics_callback = keras_metrics.MetricsCallback(val_x, val_y)

    logging.info("Fitting the network...")

    history = model.fit(train_x, train_y,
                        validation_data=(val_x, val_y),
                        epochs=epochs,
                        batch_size=batch_size,
                        sample_weight=train_y_weights,
                        callbacks=[metrics_callback])

    if SHOW_PLOTS:
        plots.plot_accuracy(history)
        plots.plot_loss(history)
        plots.plot_prf(metrics_callback)

    if SAVE_MODEL:
        model.save(MODEL_PATH)
        logging.info("Model saved in %s", MODEL_PATH)

else:
    logging.info("Loading model %s...", MODEL_PATH)
    model = load_model(MODEL_PATH)
    logging.info("Model loaded.")

logging.info("Predicting on the test set...")
output = model.predict(x=test_x, verbose=1)
logging.debug("Output shape: %s", np.shape(output))

obtained_tokens = postprocessing.undo_sequential(output)
obtained_words = postprocessing.get_words(test_doc, obtained_tokens)

precision = metrics.precision(test_answer, obtained_words, STEM_MODE)
recall = metrics.recall(test_answer, obtained_words, STEM_MODE)
f1 = metrics.f1(precision, recall)

print("### Obtained Scores ###")
print("###")
print("### Precision : %.4f" % precision)
print("### Recall : %.4f" % recall)
print("### F1 : %.4f" % f1)
print("### ###")

keras_precision = keras_metrics.keras_precision(test_y, output)
keras_recall = keras_metrics.keras_recall(test_y, output)
keras_f1 = keras_metrics.keras_f1(test_y, output)

print("### Obtained Scores (Keras metrics) ###")
print("###")
print("### Precision : %.4f" % keras_precision)
print("### Recall : %.4f" % keras_recall)
print("### F1 : %.4f" % keras_f1)
print("### ###")
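The per-token weighting above relies on Keras' temporal sample weights: with sample_weight_mode="temporal", fit expects one weight per token rather than one per document, so rare keyphrase tokens can be up-weighted against the dominant non-keyphrase class. A minimal sketch with toy shapes of how the 'balanced' weights come out; the two-class (non-KP / KP) layout mirrors the Dense(2) output above.

import numpy as np
from sklearn.utils import class_weight

# toy one-hot labels: 2 documents, 5 tokens each, 2 classes (0 = not a KP, 1 = KP token)
toy_y = np.zeros((2, 5, 2))
toy_y[:, :, 0] = 1          # start with everything labelled "not a keyphrase"
toy_y[0, 1, :] = [0, 1]     # one keyphrase token in the first document
toy_y[1, 3, :] = [0, 1]     # one keyphrase token in the second document

labels_2d = np.argmax(toy_y, axis=2)                      # shape (2, 5)
weights = np.reshape(
    class_weight.compute_sample_weight('balanced', labels_2d.flatten()),
    np.shape(labels_2d))                                  # one weight per token, shape (2, 5)

print(weights)  # rare KP tokens get weight 2.5, the frequent non-KP tokens get 0.625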
@@ -0,0 +1,209 @@
import logging
import os

from nlp import tokenizer as tk


class Dataset(object):
    """
    An abstract class that represents a dataset.
    """

    def __init__(self, name, path):
        self.path = path
        self.name = name
        self.test_documents = None
        self.test_answers = None
        self.train_documents = None
        self.train_answers = None
        self.validation_documents = None
        self.validation_answers = None

        logging.debug("Initialized dataset %s located in folder %s" %
                      (self.name, self.path))

    def __str__(self):
        return 'Dataset %s located in %s' % (self.name, self.path)

    def _load_test_documents(self):
        """
        Loads the test documents.

        :return: a list of documents.
        """
        raise NotImplementedError

    def _load_test_answers(self):
        """
        Loads the answers for the test documents.
        :return: a list of answers.
        """
        raise NotImplementedError

    def _load_train_documents(self):
        """
        Loads the training documents.

        :return: a list of documents.
        """
        raise NotImplementedError

    def _load_train_answers(self):
        """
        Loads the answers for the training documents.
        :return: a list of answers.
        """
        raise NotImplementedError

    def _load_validation_documents(self):
        """
        Loads the validation documents.

        :return: a list of documents.
        """
        raise NotImplementedError

    def _load_validation_answers(self):
        """
        Loads the answers for the validation documents.
        :return: a list of answers.
        """
        raise NotImplementedError

    def load_test(self):
        """
        Loads the test documents and their answers.
        :return: a tuple containing the test documents and the test answers.
        """

        if not self.test_documents:
            self.test_documents = self._load_test_documents()

        if not self.test_answers:
            self.test_answers = self._load_test_answers()

        assert (len(self.test_documents) == len(self.test_answers)), \
            "You have not enough (or too many) test answers for your documents!"

        logging.debug("Loaded the test set for dataset %s" % self.name)

        return self.test_documents, self.test_answers

    def load_train(self):
        """
        Loads the training documents and their answers.
        :return: a tuple containing the training documents and the training answers.
        """
        if not self.train_documents:
            self.train_documents = self._load_train_documents()

        if not self.train_answers:
            self.train_answers = self._load_train_answers()

        assert (len(self.train_documents) == len(self.train_answers)), \
            "You have not enough (or too many) train answers for your documents!"

        logging.debug("Loaded the training set for dataset %s" % self.name)

        return self.train_documents, self.train_answers

    def load_validation(self):
        """
        Loads the validation documents and their answers.
        :return: a tuple containing the validation documents and the validation answers.
        """
        if not self.validation_documents:
            self.validation_documents = self._load_validation_documents()

        if not self.validation_answers:
            self.validation_answers = self._load_validation_answers()

        assert (not self.validation_documents and not self.validation_answers) or \
               (len(self.validation_documents) == len(self.validation_answers)), \
            "You have not enough (or too many) validation answers for your documents!"

        logging.debug("Loaded the validation set for dataset %s" % self.name)

        return self.validation_documents, self.validation_answers


class EnNews(Dataset):
    """
    Energy news dataset, structured after Anette Hulth's "Improved Automatic
    Keyword Extraction Given More Linguistic Knowledge".

    Note: to make the results obtained with this dataset comparable to
    the ones described in Hulth's paper, only the "uncontrolled" terms
    are used.

    Full text here: http://www.aclweb.org/anthology/W03-1028
    """

    def __init__(self, path):
        super().__init__("EnergyNews", path)

    def __load_documents(self, folder):
        """
        Loads the documents in the .clr files contained
        in the specified folder and puts them in a dictionary
        indexed by document id (i.e. the filename without the
        extension).

        :param folder: the folder containing the documents
        :return: a dictionary with the documents
        """

        # This dictionary will contain the documents
        documents = {}

        for doc in os.listdir("%s/%s" % (self.path, folder)):
            if doc.endswith(".clr"):
                content = open(("%s/%s/%s" % (self.path, folder, doc)), "r").read()
                documents[doc[:doc.find('.')]] = content

        return documents

    def __load_answers(self, folder):
        """
        Loads the answers contained in the .key files
        and puts them in a dictionary indexed by document ID
        (i.e. the document name without the extension).
        :param folder: the folder containing the answer files
        :return: a dictionary with the answers
        """

        # This dictionary will contain the answers
        answers = {}

        for doc in os.listdir("%s/%s" % (self.path, folder)):
            if doc.endswith(".key"):
                content = open(("%s/%s/%s" % (self.path, folder, doc)), "r").read()
                retrieved_answers = content.split(' ')
                doc_id = doc[:doc.find('.')]
                for answer in retrieved_answers:
                    answer = answer.strip()  # strip leading/trailing whitespace and newlines
                    if doc_id not in answers:
                        answers[doc_id] = [answer]
                    else:
                        answers[doc_id].append(answer)

        return answers

    def _load_test_documents(self):
        return self.__load_documents("test")

    def _load_train_documents(self):
        return self.__load_documents("train")

    def _load_validation_documents(self):
        return self.__load_documents("validation")

    def _load_test_answers(self):
        return self.__load_answers("test")

    def _load_train_answers(self):
        return self.__load_answers("train")

    def _load_validation_answers(self):
        return self.__load_answers("validation")
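The abstract Dataset class only asks subclasses to implement the six _load_* hooks; the public load_* methods add caching and sanity checks. A minimal sketch of a new corpus loader, assuming it lives in the same module as the classes above and a hypothetical folder layout with plain-text .txt documents and .key answer files containing one keyphrase per line:

import os

class MyCorpus(Dataset):
    """Example subclass; folder layout and file extensions are hypothetical."""

    def __init__(self, path):
        super().__init__("MyCorpus", path)

    def __load(self, folder, extension):
        # read every file with the given extension into a dict keyed by file name
        items = {}
        for name in os.listdir("%s/%s" % (self.path, folder)):
            if name.endswith(extension):
                with open("%s/%s/%s" % (self.path, folder, name)) as f:
                    items[name[:name.find('.')]] = f.read()
        return items

    def _load_train_documents(self):
        return self.__load("train", ".txt")

    def _load_train_answers(self):
        return {k: v.splitlines() for k, v in self.__load("train", ".key").items()}

    def _load_test_documents(self):
        return self.__load("test", ".txt")

    def _load_test_answers(self):
        return {k: v.splitlines() for k, v in self.__load("test", ".key").items()}

    def _load_validation_documents(self):
        return self.__load("validation", ".txt")

    def _load_validation_answers(self):
        return {k: v.splitlines() for k, v in self.__load("validation", ".key").items()}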
@@ -0,0 +1,31 @@
import os


def write_anno(output_folder, documents, keyphrases):
    # create the output directory if it does not exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for doc_id, doc_string in documents.items():

        i = 0
        output_file = open("%s/%s.%s" % (output_folder, doc_id, "ann"), "w")

        for kp in keyphrases[doc_id]:
            kp_string = ' '.join(kp)

            for start_index in list(find_all(doc_string, kp_string)):
                end_index = start_index + len(kp_string)
                output_file.write("T%s\t%s %s %s\t%s\n" %
                                  (i, "NO_TYPE", start_index, end_index, kp_string))
                i += 1  # give each annotation a unique ID

        output_file.close()


def find_all(target_string, substring):
    start = 0
    while True:
        start = target_string.find(substring, start)
        if start == -1: return
        yield start
        start += 1
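find_all yields every (possibly overlapping) start offset of a substring, which is what lets write_anno emit one brat-style "T" line per occurrence of a keyphrase in the document text. A quick usage sketch:

# overlapping matches are found because the search restarts at start + 1
hits = list(find_all("wind power and wind power plants", "wind power"))
print(hits)                          # [0, 15]
print(list(find_all("aaaa", "aa")))  # [0, 1, 2] -- overlapping offsets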
@@ -0,0 +1,220 @@
import keras
import numpy as np
import logging


class MetricsCallback(keras.callbacks.Callback):

    def __init__(self, val_x, val_y):
        self.val_x = val_x
        self.val_y = val_y
        self.epoch = []
        self.history = {}

    def on_epoch_end(self, epoch, logs={}):

        # Predict on the validation data
        y_pred = self.model.predict(self.val_x)

        precision = keras_precision(self.val_y, y_pred)
        recall = keras_recall(self.val_y, y_pred)
        f1 = keras_f1(self.val_y, y_pred)

        print("")
        print("### Validation Scores ###")
        print("###")
        print("### Epoch : %s" % (epoch + 1))
        print("### Precision : %.4f" % precision)
        print("### Recall : %.4f" % recall)
        print("### F1 : %.4f" % f1)
        print("### ###")

        self.epoch.append(epoch + 1)
        self.history.setdefault("precision", []).append(precision)
        self.history.setdefault("recall", []).append(recall)
        self.history.setdefault("f1", []).append(f1)


class MetricsCallbackQA(keras.callbacks.Callback):

    def __init__(self, val_x, val_y, batch_size=128):
        self.val_x = val_x
        self.val_y = val_y
        self.epoch = []
        self.history = {}
        self.batch_size = batch_size

    def on_epoch_end(self, epoch, logs={}):

        # Predict on the validation data
        y_pred = self.model.predict(self.val_x, batch_size=self.batch_size, verbose=1)

        precision = keras_precision_qa(self.val_y, y_pred)
        recall = keras_recall_qa(self.val_y, y_pred)
        f1 = keras_f1_qa(self.val_y, y_pred)

        print("")
        print("### Validation Scores ###")
        print("###")
        print("### Epoch : %s" % (epoch + 1))
        print("### Precision : %.4f" % precision)
        print("### Recall : %.4f" % recall)
        print("### F1 : %.4f" % f1)
        print("### ###")

        self.epoch.append(epoch + 1)
        self.history.setdefault("precision", []).append(precision)
        self.history.setdefault("recall", []).append(recall)
        self.history.setdefault("f1", []).append(f1)


def keras_precision(y_true, y_pred):

    true_positives = 0
    false_positives = 0

    # reduce dimensionality
    y_true_2d = np.argmax(y_true, axis=2)
    y_pred_2d = np.argmax(y_pred, axis=2)

    y_true_indices = {}

    for i in range(np.shape(y_true_2d)[0]):
        doc_true_indices = []
        in_word = False

        for j in range(np.shape(y_true_2d)[1]):
            if y_true_2d[i][j] == 1:
                doc_true_indices.append(["%s" % j])
                in_word = True
            elif j > 0 and y_true_2d[i][j] == 2 and in_word:
                doc_true_indices[len(doc_true_indices) - 1].append(",%s" % j)
            else:
                in_word = False

        y_true_indices[i] = doc_true_indices

    y_pred_indices = {}

    for i in range(np.shape(y_pred_2d)[0]):
        doc_true_indices = []
        in_word = False
        for j in range(np.shape(y_pred_2d)[1]):

            if y_pred_2d[i][j] == 1:
                doc_true_indices.append(["%s" % j])
                in_word = True
            elif j > 0 and y_pred_2d[i][j] == 2 and in_word:
                doc_true_indices[len(doc_true_indices) - 1].append(",%s" % j)
            else:
                in_word = False

        y_pred_indices[i] = doc_true_indices

    for i in range(len(y_pred_indices)):
        for kp in y_pred_indices[i]:
            if kp in y_true_indices[i]:
                true_positives += 1
            else:
                false_positives += 1

    return (1.0 * true_positives) / (true_positives + false_positives) \
        if true_positives + false_positives > 0 else 0


def keras_recall(y_true, y_pred):

    true_positives = 0
    false_positives = 0

    # reduce dimensionality
    y_true_2d = np.argmax(y_true, axis=2)
    y_pred_2d = np.argmax(y_pred, axis=2)

    y_true_indices = {}

    for i in range(np.shape(y_true_2d)[0]):
        doc_true_indices = []
        in_word = False

        for j in range(np.shape(y_true_2d)[1]):
            if y_true_2d[i][j] == 1:
                doc_true_indices.append(["%s" % j])
                in_word = True
            elif j > 0 and y_true_2d[i][j] == 2 and in_word:
                doc_true_indices[len(doc_true_indices) - 1].append(",%s" % j)
            else:
                in_word = False

        y_true_indices[i] = doc_true_indices

    y_pred_indices = {}

    for i in range(np.shape(y_pred_2d)[0]):
        doc_true_indices = []
        in_word = False
        for j in range(np.shape(y_pred_2d)[1]):

            if y_pred_2d[i][j] == 1:
                doc_true_indices.append(["%s" % j])
                in_word = True
            elif j > 0 and y_pred_2d[i][j] == 2 and in_word:
                doc_true_indices[len(doc_true_indices) - 1].append(",%s" % j)
            else:
                in_word = False

        y_pred_indices[i] = doc_true_indices

    for i in range(len(y_pred_indices)):
        for kp in y_pred_indices[i]:
            if kp in y_true_indices[i]:
                true_positives += 1

    return (1.0 * true_positives) / sum(len(kps) for doc, kps in y_true_indices.items())


def keras_f1(y_true, y_pred):
    p = keras_precision(y_true, y_pred)
    r = keras_recall(y_true, y_pred)
    return (2 * (p * r)) / (p + r) if p != 0 and r != 0 else 0


def keras_precision_qa(y_true, y_pred):

    # Prepare data
    if np.shape(y_pred)[1] == 2:
        # If one-hot prediction...
        y_true = np.argmax(y_true, axis=1)
        y_pred = np.argmax(y_pred, axis=1)

    else:
        # If similarity-based...
        y_pred = np.reshape(y_pred, np.shape(y_true))
        y_pred = np.round(y_pred)

    den = np.count_nonzero(y_pred)

    if den == 0:
        logging.log(logging.WARNING, "Network did not predict any positive sample")
        return 0

    return np.count_nonzero(np.in1d(np.where(y_pred), np.where(y_true))) / den


def keras_recall_qa(y_true, y_pred):
    # Prepare data
    if np.shape(y_pred)[1] == 2:
        # If one-hot prediction...
        y_true = np.argmax(y_true, axis=1)
        y_pred = np.argmax(y_pred, axis=1)

    else:
        # If similarity-based...
        y_pred = np.reshape(y_pred, np.shape(y_true))
        y_pred = np.round(y_pred)

    return np.count_nonzero(np.in1d(np.where(y_true), np.where(y_pred))) / np.count_nonzero(y_true)


def keras_f1_qa(y_true, y_pred):
    p = keras_precision_qa(y_true, y_pred)
    r = keras_recall_qa(y_true, y_pred)
    return (2 * (p * r)) / (p + r) if p + r > 0 else 0
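keras_precision and keras_recall compare keyphrases as groups of token positions, where class 1 marks the first token of a keyphrase and class 2 marks a continuation token. A small sketch of that decoding on hand-built one-hot arrays, assuming it runs alongside the functions above:

import numpy as np

def one_hot(labels, num_classes=3):
    # one document, len(labels) tokens, classes 0 = outside, 1 = KP start, 2 = KP continuation
    out = np.zeros((1, len(labels), num_classes))
    out[0, np.arange(len(labels)), labels] = 1
    return out

y_true = one_hot([0, 1, 2, 0, 1, 0])   # gold KPs: tokens (1,2) and token (4)
y_pred = one_hot([0, 1, 2, 0, 0, 0])   # predicted KPs: tokens (1,2) only

print(keras_precision(y_true, y_pred))  # 1.0  (the single predicted KP is correct)
print(keras_recall(y_true, y_pred))     # 0.5  (one of the two gold KPs was found)
print(keras_f1(y_true, y_pred))         # ~0.667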
@@ -0,0 +1,82 @@
from enum import Enum
from nltk.stem import *

stemMode = Enum("StemmerMode", "none both results")


def precision(reference, obtained, stem=stemMode.none):

    true_positives = 0
    false_positives = 0

    for doc, reference_kps_tokens in reference.items():
        obtained_kps_tokens = obtained[doc]

        reference_kps = []
        obtained_kps = []

        for ref_tokens in reference_kps_tokens:

            if stem == stemMode.both:
                stemmer = PorterStemmer()
                ref_tokens = [stemmer.stem(token) for token in ref_tokens]

            reference_kp = ' '.join(ref_tokens)
            reference_kps.append(reference_kp.lower())

        for obt_tokens in obtained_kps_tokens:

            if stem == stemMode.both or stem == stemMode.results:
                stemmer = PorterStemmer()
                obt_tokens = [stemmer.stem(token) for token in obt_tokens]

            obt_string = ' '.join(obt_tokens).lower()
            if obt_string not in obtained_kps:
                # this is necessary, because if we stem the kps we may
                # obtain duplicates
                obtained_kps.append(obt_string)

        for obt_string in obtained_kps:
            if obt_string in reference_kps:
                true_positives += 1
            else:
                false_positives += 1

    return (true_positives * 1.0) / (true_positives + false_positives) if true_positives + false_positives > 0 else 0


def recall(reference, obtained, stem=stemMode.none):

    true_positives = 0
    total_reference = sum(len(kps) for doc, kps in reference.items())

    for doc, reference_kps_tokens in reference.items():
        obtained_kps_tokens = obtained[doc]

        reference_kps = []

        for ref_tokens in reference_kps_tokens:

            if stem == stemMode.both:
                stemmer = PorterStemmer()
                ref_tokens = [stemmer.stem(token) for token in ref_tokens]

            reference_kp = ' '.join(ref_tokens)
            reference_kps.append(reference_kp)

        for obt_tokens in obtained_kps_tokens:

            if stem == stemMode.both or stem == stemMode.results:
                stemmer = PorterStemmer()
                obt_tokens = [stemmer.stem(token) for token in obt_tokens]

            obt_string = ' '.join(obt_tokens)
            if obt_string in reference_kps:
                true_positives += 1
                reference_kps.remove(obt_string)

    return (true_positives * 1.0) / total_reference


def f1(precision, recall):
    return (2 * (precision * recall)) / (precision + recall) if precision + recall > 0 else 0
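precision and recall work on dictionaries mapping each document id to a list of tokenized keyphrases, and stemMode controls whether the reference and/or the obtained keyphrases are stemmed before matching. A toy sketch (requires NLTK for PorterStemmer), assuming it runs alongside the functions above:

reference = {"doc1": [["renewable", "energy"], ["wind", "turbines"]]}
obtained = {"doc1": [["renewable", "energy"], ["wind", "turbine"]]}

print(precision(reference, obtained))                 # 0.5: exact matching only finds one KP
print(recall(reference, obtained))                    # 0.5
print(precision(reference, obtained, stemMode.both))  # 1.0: "turbines" and "turbine" share a stem
print(f1(precision(reference, obtained, stemMode.both),
         recall(reference, obtained, stemMode.both)))  # 1.0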
File diff suppressed because it is too large
@@ -0,0 +1,95 @@
import nltk
from nltk.chunk.regexp import *
from nlp import tokenizer as tk


KP_REGEX_1 = "<JJ|NN|NNP|NNS|NNPS>*<NN|NNP|NNS|NNPS|VB|VBG>"
KP_REGEX_2 = "<JJ>?<NN|NNS>+<IN><NN|NNS>"
KP_REGEX_3 = "<JJ|VBN>*<NN|NNS>"

noun_phrase_grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*|VBG>}  # Nouns and adjectives, terminated with nouns or -ing verbs

    KP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
    """

hulth_grammar = r"""
    NBAR:
        {<NN.*|JJ.*>*<NN.*|VBG>}  # Nouns and adjectives, terminated with nouns or -ing verbs

    VBPART:
        {<VBG|VBP><NBAR>}  # Verb in participle form, then nouns

    COUNT:
        {<CD><NBAR>}  # Numbers then nouns

    NP:
        {<NBAR><IN><NBAR>}
    """

hulth_labels = ['NP', 'NBAR', 'COUNT', 'VBPART']


def extract_candidates_from_set(set, tokenizer):
    """
    Generates the candidate keyphrases for a document.

    :param set: the training, test or validation set
    :param tokenizer: which tokenizer to use
    :return: a dictionary where each document is associated with its candidate keyphrases
    """

    candidates = {}
    for doc, str in set.items():
        candidates[doc] = extract_candidates(str, tokenizer)

    return candidates


def extract_candidates(document, tokenizer):
    """
    Extracts the candidate keyphrases from a string.

    :param document: the string to analyze
    :param tokenizer: the tokenizer to use
    :return: the list of candidate keyphrases for the input document
    """

    return extract_valid_tokens(tk.tokenize(document, tokenizer))


def extract_valid_tokens(tokens):
    """
    Given a list of tokens, returns the subsets of such list which are potential keyphrases according to
    the provided part-of-speech patterns.

    :param tokens: the token list to analyze
    :return: the list of candidate keyphrases for the input document
    """

    postagged_doc = nltk.pos_tag(tokens)

    kp_rule_1 = ChunkRule(KP_REGEX_1, "")
    kp_rule_2 = ChunkRule(KP_REGEX_2, "")
    kp_rule_3 = ChunkRule(KP_REGEX_3, "")

    # chunk_parser = RegexpChunkParser([kp_rule_1, kp_rule_2, kp_rule_3],
    #                                  chunk_label="KP")

    chunk_parser = RegexpParser(grammar=hulth_grammar)

    tree = chunk_parser.parse(postagged_doc)

    candidates = []

    for subtree in tree.subtrees():
        if subtree.label() in hulth_labels:
            candidate = []
            for leaf in subtree.leaves():
                candidate.append(leaf[0])
            if candidate not in candidates:
                candidates.append(candidate)

    return candidates
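extract_candidates PoS-tags the tokens and keeps the chunks matched by hulth_grammar, so candidate keyphrases come back as noun-phrase-like token lists. A usage sketch, assuming the NLTK 'punkt' and 'averaged_perceptron_tagger' data packages are installed:

from nlp import tokenizer as tk

text = "We study automatic keyword extraction from energy news articles."
candidates = extract_candidates(text, tk.tokenizers.nltk)
print(candidates)
# candidate token lists such as ['automatic', 'keyword', 'extraction'] and
# ['energy', 'news', 'articles']; the exact output depends on the PoS tagger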
@@ -0,0 +1,38 @@
import nltk

# NLTK uses the Penn Treebank tag set
# See http://www.comp.leeds.ac.uk/amalgam/tagsets/upenn.html
ALLOWED_TAGS_HEAD = ["NN", "NNP", "NNPS", "NNS", "VBN", "VBG", "JJ", "JJR", "JJS", "RB", "CD"]
ALLOWED_TAGS_TAIL = ["NN", "NNP", "NNPS", "NNS", "VBG", "CD", ")"]


def clean_tokens(keyphrase):
    """
    Removes the tokens from the head and the tail of a keyphrase
    (passed as a token list) that do not match the allowed PoS tags.

    :return: the cleaned keyphrase
    """

    keyphrase_pos = nltk.pos_tag(keyphrase)

    start = 0

    for start in range(len(keyphrase_pos)):
        if not keyphrase_pos[start][1] in ALLOWED_TAGS_HEAD:
            start += 1
        else:
            break

    end = len(keyphrase) - 1

    for end in range(len(keyphrase_pos) - 1, start, -1):
        if not keyphrase_pos[end][1] in ALLOWED_TAGS_TAIL:
            end -= 1
        else:
            break

    return keyphrase[start:end + 1]
@@ -0,0 +1,101 @@
from collections import OrderedDict


class Dictionary(object):
    """Dictionary utility class. This class is a lightweight version of the Keras text preprocessing module
    (see https://github.com/fchollet/keras/blob/master/keras/preprocessing/text.py), designed to work on
    tokens instead of strings.

    This class is used to build a dictionary that can in turn be used to fill an Embedding layer
    with word embeddings.

    Please note that `0` is a reserved index that won't be assigned to any word.

    The original keras.preprocessing.text module is licensed under the MIT license.
    """

    def __init__(self, num_words=None):

        self.word_counts = OrderedDict()
        self.word_index = {}
        self.reverse_word_index = None
        self.num_words = num_words
        self.document_count = 0

    def fit_on_texts(self, tokenized_documents):

        for document in tokenized_documents:
            self.document_count += 1

            for w in document:
                if w in self.word_counts:
                    self.word_counts[w] += 1
                else:
                    self.word_counts[w] = 1

        wcounts = list(self.word_counts.items())
        wcounts.sort(key=lambda x: x[1], reverse=True)
        sorted_voc = [wc[0] for wc in wcounts]
        # note that index 0 is reserved, never assigned to an existing word
        self.word_index = dict(list(zip(sorted_voc, list(range(1, len(sorted_voc) + 1)))))

    def texts_to_sequences(self, texts):
        """
        Transforms each text in texts to a sequence of integers.

        Only the top "num_words" most frequent words will be taken into account.

        :param texts: A list of texts (token lists)
        :return: A list of sequences.
        """
        texts_sequences = []
        for text in texts:
            texts_sequences.append(self.token_list_to_sequence(text))
        return texts_sequences

    def token_list_to_sequence(self, tokens):
        """Transforms a token list into a sequence of integers.

        Only the top "num_words" most frequent words will be taken into account.
        Only words known by the dictionary will be taken into account.

        # Arguments
            tokens: A list of tokens (strings).

        # Returns
            The corresponding sequence of integers.
        """
        vect = []
        for w in tokens:

            i = self.word_index.get(w)
            if i is not None:
                if self.num_words and i >= self.num_words:
                    continue
                else:
                    vect.append(i)
        return vect

    def tokens_to_words(self, tokens):
        """
        Utility that returns the words associated with the provided indices.

        :param tokens: a list of integers
        """

        if not self.reverse_word_index:
            self.build_reverse_word_index()

        words = []

        for token in tokens:
            if token != 0:
                words.append(self.reverse_word_index[token])

        return words

    def build_reverse_word_index(self):

        self.reverse_word_index = {}
        for key, value in self.word_index.items():
            self.reverse_word_index[value] = key
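A short usage sketch of the Dictionary class, assuming it runs alongside the class above: fit it on tokenized documents, then turn token lists into index sequences and back.

docs = [["solar", "power", "is", "clean", "power"],
        ["wind", "power", "is", "cheap"]]

d = Dictionary(num_words=10)
d.fit_on_texts(docs)

print(d.word_index["power"])    # 1 -- the most frequent word gets the lowest index
seq = d.token_list_to_sequence(["cheap", "solar", "power", "unknown"])
print(seq)                      # out-of-vocabulary words are silently dropped
print(d.tokens_to_words(seq))   # ['cheap', 'solar', 'power']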
@@ -0,0 +1,37 @@
from enum import Enum
import keras.preprocessing.text
import nltk

tokenizers = Enum("Tokenizers", "nltk keras")


def tokenize_set(documents, answers, tokenizer):

    tokenized_docs = {}
    for doc, str in documents.items():
        tokenized_docs[doc] = tokenize(str, tokenizer)

    tokenized_answers = {}
    for doc, answers in answers.items():
        for answer in answers:
            if doc not in tokenized_answers:
                tokenized_answers[doc] = [tokenize(answer, tokenizer)]
            else:
                tokenized_answers[doc].append(tokenize(answer, tokenizer))

    return tokenized_docs, tokenized_answers


def tokenize(string, tokenizer=tokenizers.keras):
    """
    Tokenizes a string using the selected tokenizer.
    :param string: the string to tokenize
    :param tokenizer: which tokenizer to use (nltk or keras)
    :return: the list of tokens
    """

    if tokenizer == tokenizers.nltk:
        return nltk.word_tokenize(string.lower())
    elif tokenizer == tokenizers.keras:
        return keras.preprocessing.text.text_to_word_sequence(string)
    else:
        raise NotImplementedError()
@@ -0,0 +1,25 @@
import os, sys
os.chdir(sys.path[0])  # use paths relative to this script

import numpy as np
import logging


# Load the pre-trained word vectors
def load_glove(glove_dir, size):
    embeddings_index = {}
    glove_path = ("/home/zhangxj/WorkFile/本科毕业设计/glove/vectors.txt")

    logging.debug("Loading GloVe pre-trained embeddings from %s" % glove_path)

    f = open(os.path.join(glove_dir, glove_path))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

    logging.debug('Total embeddings found: %s.' % len(embeddings_index))

    return embeddings_index
@@ -0,0 +1,14 @@
import logging


def log_versions():
    import keras
    logging.info("Keras version %s" % keras.__version__)
    import numpy as np
    logging.info("Numpy version %s" % np.__version__)
    if keras.backend.backend() == 'theano':
        import theano
        logging.info("Theano version %s" % theano.__version__)
    else:
        import tensorflow
        logging.info("Tensorflow version %s" % tensorflow.__version__)
@@ -0,0 +1,32 @@
import matplotlib.pyplot as plt


def plot_accuracy(history):
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('Model Accuracy over epochs')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['Training', 'Validation'], loc='upper left')
    plt.show()


def plot_loss(history):
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss over epochs')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['Training', 'Validation'], loc='upper left')
    plt.show()


def plot_prf(history):
    plt.plot(history.history['precision'])
    plt.plot(history.history['recall'])
    plt.plot(history.history['f1'])
    plt.title('P/R/F1 scores on validation set')
    plt.ylabel('score')
    plt.xlabel('epoch')
    plt.legend(['Precision', 'Recall', 'F1'], loc='upper left')
    plt.show()
@@ -0,0 +1,258 @@
import itertools
import numpy as np
from nlp import chunker, cleaner


def undo_sequential(output):
    """
    Transforms a 3D one-hot array of the type (documents, token, category)
    into a 2D array of the type (documents, token_category).

    :param output: a one-hot 3D array
    :return: a 2D array
    """
    return np.argmax(output, axis=2)


def get_words(docs, selections):
    """
    Gets the selected words in the provided documents.

    :param docs: the documents to analyze
    :param selections: the words selected in the documents
    :return: a dictionary with the documents and for each a list of
             the selected words
    """
    i = 0
    obtained_words = {}
    for doc, words in docs.items():
        k = 0
        obtained_words_doc = []
        in_word = False
        for token in selections[i]:
            if token == 1 and k < len(words):
                obtained_words_doc.append([words[k]])
                in_word = True
            elif token == 2 and k < len(words) and in_word:
                obtained_words_doc[len(obtained_words_doc) - 1].append(words[k])
            else:
                in_word = False
            k += 1

        # remove duplicate selections
        obtained_words_doc.sort()
        obtained_words_doc = list(w for w, _ in itertools.groupby(obtained_words_doc))
        obtained_words[doc] = obtained_words_doc
        i += 1

    return obtained_words


def get_top_words(docs, output, words_limit):
    """
    Gets the selected words in the provided documents.

    :param docs: the documents to analyze
    :param output: the output of the network
    :param words_limit: how many words to extract
    :return: a dictionary with the documents and for each a list of
             the selected words
    """

    selections = undo_sequential(output)

    i = 0
    obtained_words = {}
    for doc, words in docs.items():
        k = 0
        obtained_words_doc = []
        obtained_words_weights = []
        in_word = False
        for token in selections[i]:
            if token == 1 and k < len(words):
                obtained_words_doc.append([words[k]])
                obtained_words_weights.append(output[i, k, 1])
                in_word = True
            elif token == 2 and k < len(words) and in_word:
                obtained_words_doc[len(obtained_words_doc) - 1].append(words[k])
                obtained_words_weights[len(obtained_words_weights) - 1] = \
                    obtained_words_weights[len(obtained_words_weights) - 1] + \
                    ((output[i, k, 2] - obtained_words_weights[len(obtained_words_weights) - 1]) /
                     (len(obtained_words_doc[len(obtained_words_doc) - 1])))

                # We calculate the average at the nth step this way:
                # If A_i is the average at the ith step and x_i is the ith item of the sequence, then
                # A_k = A_{k-1} + ((x_k - A_{k-1}) / k)

            else:
                in_word = False
            k += 1

        if words_limit < len(obtained_words_doc):
            # there are more selections than the limit! cut them

            obtained_words_and_scores = {}
            for index, words in enumerate(obtained_words_doc):
                obtained_words_and_scores[index] = obtained_words_weights[index]

            sorted_words = sorted(obtained_words_and_scores, key=obtained_words_and_scores.__getitem__, reverse=True)

            ok_obtained_words = []
            cur_word = 0
            while len(ok_obtained_words) < words_limit and cur_word < len(sorted_words):
                if obtained_words_doc[sorted_words[cur_word]] not in ok_obtained_words:
                    ok_obtained_words.append(obtained_words_doc[sorted_words[cur_word]])
                cur_word += 1
            obtained_words_doc = ok_obtained_words

        else:
            # just remove duplicate selections
            obtained_words_doc.sort()
            obtained_words_doc = list(w for w, _ in itertools.groupby(obtained_words_doc))

        obtained_words[doc] = obtained_words_doc
        i += 1

    return obtained_words


def get_valid_patterns(answer_set):
    """
    Removes the answers from a set that do NOT match the keyphrase part-of-speech patterns.

    :param answer_set: a dictionary of documents and tokenized keyphrases
    :return: a dictionary of documents and tokenized keyphrases that match the part-of-speech patterns
    """

    doc_filtered = {}

    for doc, kps in answer_set.items():
        filtered_keyphrases = []
        for kp in kps:
            for valid_kp in chunker.extract_valid_tokens(kp):
                filtered_keyphrases.append(valid_kp)

        # remove duplicates
        filtered_keyphrases.sort()
        filtered_keyphrases = list(w for w, _ in itertools.groupby(filtered_keyphrases))
        doc_filtered[doc] = filtered_keyphrases

    return doc_filtered


def clean_answers(answer_set):
    """
    Cleans the keyphrases by removing the tokens that are not PoS tagged with the allowed tags.

    :param answer_set: a dictionary of documents and tokenized keyphrases
    :return: a dictionary of documents and their cleaned tokenized keyphrases
    """
    doc_filtered = {}

    for doc, kps in answer_set.items():
        filtered_keyphrases = []
        for kp in kps:
            clean_kp = cleaner.clean_tokens(kp)
            if clean_kp:
                filtered_keyphrases.append(clean_kp)

        # remove duplicates
        filtered_keyphrases.sort()
        filtered_keyphrases = list(w for w, _ in itertools.groupby(filtered_keyphrases))
        doc_filtered[doc] = filtered_keyphrases

    return doc_filtered


def get_answers(candidate_tokens, predict_set, predict_result, dictionary):
    """
    Builds the dictionary of the selected answers for a QA-based network.

    :param candidate_tokens: the dictionary of the documents and their candidate KPs
    :param predict_set: the input of the network
    :param predict_result: the output of the network
    :param dictionary: the previously-fit word index
    :return: the dictionary of the selected KPs
    """

    # Here the idea is: we go through the dictionary of the candidates, we find the corresponding
    # model input, and we add the candidate to the answer set if the model predicted class 1
    # (i.e. that the candidate was a correct KP)

    # First, get the actual predictions:
    if np.shape(predict_result)[1] == 1:
        # If we have just 1 output neuron, reshape and put the output in 0,1 values
        predictions_flattened = np.round(np.reshape(predict_result, np.shape(predict_result)[0]))
    else:
        # If we're working with categorical output, flatten the (num_samples,2) array to a (num_samples) one
        # This way we transform a 2D array e.g. [[0.6,0.4] ... [0.2,0.8]] to a 1D array e.g. [0...1]
        predictions_flattened = np.argmax(predict_result, axis=1)

    i = 0
    answers = {}
    for doc_id, candidate_list in candidate_tokens.items():
        answers[doc_id] = []
        for candidate in candidate_list:

            # Sanity check: was the order preserved?
            assert candidate == dictionary.tokens_to_words(predict_set[1][i])

            if predictions_flattened[i] == 1:
                answers[doc_id].append(candidate)

            i += 1

    return answers


def get_top_answers(candidate_tokens, predict_set, predict_result, dictionary, limit):
    """
    Builds the dictionary of the selected answers for a QA-based network.

    :param candidate_tokens: the dictionary of the documents and their candidate KPs
    :param predict_set: the input of the network
    :param predict_result: the output of the network
    :param dictionary: the previously-fit word index
    :param limit: the maximum number of KPs to keep per document
    :return: the dictionary of the selected KPs
    """

    # Here the idea is: we go through the dictionary of the candidates, we find the corresponding
    # model input, and we add the candidate to the answer set if the model predicted class 1
    # (i.e. that the candidate was a correct KP)

    # First, get the actual predictions:
    if np.shape(predict_result)[1] == 1:
        # If we have just 1 output neuron, reshape and put the output in 0,1 values
        predictions_flattened = np.round(np.reshape(predict_result, np.shape(predict_result)[0]))
    else:
        # If we're working with categorical output, flatten the (num_samples,2) array to a (num_samples) one
        # This way we transform a 2D array e.g. [[0.6,0.4] ... [0.2,0.8]] to a 1D array e.g. [0...1]
        predictions_flattened = np.argmax(predict_result, axis=1)

    i = 0
    answers = {}
    scores = {}
    for doc_id, candidate_list in candidate_tokens.items():
        answers[doc_id] = []
        scores[doc_id] = []
        for candidate in candidate_list:

            # Sanity check: was the order preserved?
            assert candidate == dictionary.tokens_to_words(predict_set[1][i])

            if predictions_flattened[i] == 1:
                answers[doc_id].append(candidate)
                if np.shape(predict_result)[1] == 1:
                    scores[doc_id].append(predict_result[i][0])
                else:
                    scores[doc_id].append(predict_result[i][1])

            i += 1

        if len(answers[doc_id]) > limit:
            answers[doc_id] = [x for _, x in sorted(zip(scores[doc_id], answers[doc_id]), reverse=True)][:limit]

    return answers
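undo_sequential and get_words together turn the network's per-token class probabilities back into keyphrases: class 1 opens a keyphrase and class 2 extends it. A toy sketch, assuming it runs alongside the functions above:

import numpy as np

docs = {"doc1": ["offshore", "wind", "farms", "reduce", "emissions"]}

# fake network output: one document, five tokens, three classes
probs = np.array([[[0.1, 0.8, 0.1],    # "offshore"  -> class 1 (keyphrase start)
                   [0.1, 0.1, 0.8],    # "wind"      -> class 2 (continuation)
                   [0.1, 0.1, 0.8],    # "farms"     -> class 2 (continuation)
                   [0.9, 0.05, 0.05],  # "reduce"    -> class 0 (outside)
                   [0.2, 0.7, 0.1]]])  # "emissions" -> class 1 (new keyphrase)

print(undo_sequential(probs))                   # [[1 2 2 0 1]]
print(get_words(docs, undo_sequential(probs)))
# {'doc1': [['emissions'], ['offshore', 'wind', 'farms']]} -- sorted, duplicates removed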
@ -0,0 +1,560 @@
|
|||
from keras.preprocessing.sequence import pad_sequences
|
||||
from keras.utils import np_utils
|
||||
from utils import glove
|
||||
from nlp import dictionary as dict
|
||||
import logging
|
||||
import numpy as np
|
||||
import random
|
||||
|
||||
|
||||
def prepare_answer(train_doc, train_answer, train_candidates,
|
||||
test_doc, test_answer, test_candidates,
|
||||
val_doc=None, val_answer=None, val_candidates=None,
|
||||
max_document_length=1000,
|
||||
max_answer_length=20,
|
||||
max_vocabulary_size=50000,
|
||||
embeddings_size=50):
|
||||
"""
|
||||
Prepares a dataset for use by a question-answer like model. This version will use the patterns generated
|
||||
previously for the training, test and validation sets as candidate for all three sets.
|
||||
|
||||
:param train_doc: the training documents
|
||||
:param train_answer: the KPs for the training documents
|
||||
:param train_candidates: the candidate KPs for the training documents
|
||||
:param test_doc: the test documents
|
||||
:param test_answer: the KPs for the test documents
|
||||
:param test_candidates: the candidate KPs for the test documents
|
||||
:param val_doc: the validation documents (can be None)
|
||||
:param val_answer: the KPs for the validation documents (can be None)
|
||||
:param val_candidates: the candidate KPs for the validation documents (can be None)
|
||||
:param max_document_length: the maximum length of the documents (shorter documents will be truncated!)
|
||||
:param max_answer_length: the maximum length of the answers (shorter answers will be truncated!)
|
||||
:param max_vocabulary_size: the maximum size of the vocabulary to use
|
||||
(i.e. we keep only the top max_vocabulary_size words)
|
||||
:param embeddings_size: the size of the GLoVE embeddings to use
|
||||
:return: a tuple (train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix) containing the training,
|
||||
test and validation set, and an embedding matrix for an Embedding layer
|
||||
"""
|
||||
|
||||
# Prepare validation return data
|
||||
val_q = None
|
||||
val_a = None
|
||||
val_y = None
|
||||
|
||||
# Prepare the return values: lists that will hold questions (documents), answers (keyphrases), and truth values
|
||||
train_q = []
|
||||
test_q = []
|
||||
train_a = []
|
||||
test_a = []
|
||||
train_y = []
|
||||
test_y = []
|
||||
|
||||
if val_doc and val_answer:
|
||||
val_q = []
|
||||
val_a = []
|
||||
val_y = []
|
||||
|
||||
documents_full = []
|
||||
for key, doc in train_doc.items():
|
||||
documents_full.append(token for token in doc)
|
||||
for key, doc in test_doc.items():
|
||||
documents_full.append(token for token in doc)
|
||||
|
||||
if val_doc and val_answer:
|
||||
for key, doc in val_doc.items():
|
||||
documents_full.append(token for token in doc)
|
||||
|
||||
logging.debug("Fitting dictionary on %s documents..." % len(documents_full))
|
||||
|
||||
dictionary = dict.Dictionary(num_words=max_vocabulary_size)
|
||||
dictionary.fit_on_texts(documents_full)
|
||||
|
||||
logging.debug("Dictionary fitting completed. Found %s unique tokens" % len(dictionary.word_index))
|
||||
|
||||
# Pair up each document with a candidate keyphrase and its truth value
|
||||
for key, document in train_doc.items():
|
||||
doc_sequence = dictionary.token_list_to_sequence(document)
|
||||
for kp in train_candidates[key]:
|
||||
train_q.append(doc_sequence)
|
||||
train_a.append(dictionary.token_list_to_sequence(kp))
|
||||
train_y.append([0, 1] if kp in train_answer[key] else [1, 0])
|
||||
|
||||
for key, document in test_doc.items():
|
||||
doc_sequence = dictionary.token_list_to_sequence(document)
|
||||
for kp in test_candidates[key]:
|
||||
test_q.append(doc_sequence)
|
||||
test_a.append(dictionary.token_list_to_sequence(kp))
|
||||
test_y.append([0, 1] if kp in test_answer[key] else [1, 0])
|
||||
|
||||
if val_doc and val_answer:
|
||||
for key, document in val_doc.items():
|
||||
doc_sequence = dictionary.token_list_to_sequence(document)
|
||||
for kp in val_candidates[key]:
|
||||
val_q.append(doc_sequence)
|
||||
val_a.append(dictionary.token_list_to_sequence(kp))
|
||||
val_y.append([0, 1] if kp in val_answer[key] else [1, 0])
|
||||
|
||||
logging.debug("Longest training document : %s tokens" % len(max(train_q, key=len)))
|
||||
logging.debug("Longest training answer : %s tokens" % len(max(train_a, key=len)))
|
||||
logging.debug("Longest test document : %s tokens" % len(max(test_q, key=len)))
|
||||
logging.debug("Longest test answer : %s tokens" % len(max(test_a, key=len)))
|
||||
if val_doc and val_answer:
|
||||
logging.debug("Longest validation document : %s tokens" % len(max(val_q, key=len)))
|
||||
logging.debug("Longest validation answer : %s tokens" % len(max(val_a, key=len)))
|
||||
|
||||
train_q = np.asarray(pad_sequences(train_q, maxlen=max_document_length, padding='post', truncating='post'))
|
||||
train_a = np.asarray(pad_sequences(train_a, maxlen=max_answer_length, padding='post', truncating='post'))
|
||||
|
||||
test_q = np.asarray(pad_sequences(test_q, maxlen=max_document_length, padding='post', truncating='post'))
|
||||
test_a = np.asarray(pad_sequences(test_a, maxlen=max_answer_length, padding='post', truncating='post'))
|
||||
|
||||
if val_doc and val_answer:
|
||||
val_q = np.asarray(pad_sequences(val_q, maxlen=max_document_length, padding='post', truncating='post'))
|
||||
val_a = np.asarray(pad_sequences(val_a, maxlen=max_answer_length, padding='post', truncating='post'))
|
||||
|
||||
logging.debug("Training set documents size : %s", np.shape(train_q))
|
||||
logging.debug("Training set answers size : %s", np.shape(train_a))
|
||||
logging.debug("Test set documents size : %s", np.shape(test_q))
|
||||
logging.debug("Test set answers size : %s ", np.shape(test_a))
|
||||
|
||||
if val_doc and val_answer:
|
||||
logging.debug("Validation set documents size : %s", np.shape(val_q))
|
||||
logging.debug("Validation set answers size : %s ", np.shape(val_a))
|
||||
|
||||
# prepare the matrix for the embedding layer
|
||||
word_index = dictionary.word_index
|
||||
embeddings_index = glove.load_glove('', embeddings_size)
|
||||
|
||||
num_words = min(max_vocabulary_size, 1 + len(word_index))
|
||||
|
||||
logging.debug("Building embedding matrix of size [%s,%s]..." % (num_words, embeddings_size))
|
||||
|
||||
embedding_matrix = np.zeros((num_words, embeddings_size))
|
||||
for word, i in word_index.items():
|
||||
if i >= num_words:
|
||||
continue
|
||||
embedding_vector = embeddings_index.get(word)
|
||||
if embedding_vector is not None:
|
||||
# words not found in embedding index will be all-zeros.
|
||||
embedding_matrix[i] = embedding_vector
|
||||
|
||||
return [train_q, train_a], train_y, [test_q, test_a], test_y, [val_q, val_a], val_y, embedding_matrix, dictionary
|
||||
|
||||
|
||||
def prepare_answer_2(train_doc, train_answer, train_candidates,
|
||||
test_doc, test_answer, test_candidates,
|
||||
val_doc=None, val_answer=None, val_candidates=None,
|
||||
max_document_length=1000,
|
||||
max_answer_length=20,
|
||||
max_vocabulary_size=50000,
|
||||
embeddings_size=50):
|
||||
"""
|
||||
Prepares a dataset for use by a question-answer like model. This version will use the patterns generated
|
||||
previously for the test and validation sets as candidate for these sets, and mix the correct answers with
|
||||
wrong patterns on the training set to build in order to have balanced data for training.
|
||||
|
||||
:param train_doc: the training documents
|
||||
:param train_answer: the KPs for the training documents
|
||||
:param train_candidates: the candidate KPs for the training documents
|
||||
:param test_doc: the test documents
|
||||
:param test_answer: the KPs for the test documents
|
||||
:param test_candidates: the candidate KPs for the test documents
|
||||
:param val_doc: the validation documents (can be None)
|
||||
:param val_answer: the KPs for the validation documents (can be None)
|
||||
:param val_candidates: the candidate KPs for the validation documents (can be None)
|
||||
:param max_document_length: the maximum length of the documents (shorter documents will be truncated!)
|
||||
:param max_answer_length: the maximum length of the answers (shorter answers will be truncated!)
|
||||
:param max_vocabulary_size: the maximum size of the vocabulary to use
|
||||
(i.e. we keep only the top max_vocabulary_size words)
|
||||
:param embeddings_size: the size of the GLoVE embeddings to use
|
||||
:return: a tuple (train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix) containing the training,
|
||||
test and validation set, and an embedding matrix for an Embedding layer
|
||||
"""
|
||||
|
||||
# Prepare validation return data
|
||||
val_q = None
|
||||
val_a = None
|
||||
val_y = None
|
||||
|
||||
val_q_balanced = None
|
||||
val_a_balanced = None
|
||||
val_y_balanced = None
|
||||
|
||||
# Prepare the return values: lists that will hold questions (documents), answers (keyphrases), and truth values
|
||||
train_q = []
|
||||
test_q = []
|
||||
train_a = []
|
||||
test_a = []
|
||||
train_y = []
|
||||
test_y = []
|
||||
|
||||
if val_doc and val_answer:
|
||||
val_q = []
|
||||
val_a = []
|
||||
val_y = []
|
||||
val_q_balanced = []
|
||||
val_a_balanced = []
|
||||
val_y_balanced = []
|
||||
|
||||
documents_full = []
|
||||
for key, doc in train_doc.items():
|
||||
documents_full.append(token for token in doc)
|
||||
for key, doc in test_doc.items():
|
||||
documents_full.append(token for token in doc)
|
||||
|
||||
if val_doc and val_answer:
|
||||
for key, doc in val_doc.items():
|
||||
documents_full.append(token for token in doc)
|
||||
|
||||
logging.debug("Fitting dictionary on %s documents..." % len(documents_full))
|
||||
|
||||
dictionary = dict.Dictionary(num_words=max_vocabulary_size)
|
||||
dictionary.fit_on_texts(documents_full)
|
||||
|
||||
logging.debug("Dictionary fitting completed. Found %s unique tokens" % len(dictionary.word_index))
|
||||
|
||||
# Pair up each document with a candidate keyphrase and its truth value
|
||||
for key, document in train_doc.items():
|
||||
doc_sequence = dictionary.token_list_to_sequence(document)
|
||||
|
||||
# select wrong candidates (possibly, in same quantity as good answers)
|
||||
wrong_candidates = list(train_candidates[key])
|
||||
for answer in train_answer[key]:
|
||||
if answer in wrong_candidates:
|
||||
wrong_candidates.remove(answer)
|
||||
|
||||
while len(wrong_candidates) > len(train_answer[key]):
|
||||
random_candidate = random.choice(wrong_candidates)
|
||||
wrong_candidates.remove(random_candidate)
|
||||
|
||||
# append wrong candidates
|
||||
for kp in wrong_candidates:
|
||||
train_q.append(doc_sequence)
|
||||
train_a.append(dictionary.token_list_to_sequence(kp))
|
||||
train_y.append([1, 0])
|
||||
|
||||
# append true answers
|
||||
for kp in train_answer[key]:
|
||||
train_q.append(doc_sequence)
|
||||
train_a.append(dictionary.token_list_to_sequence(kp))
|
||||
train_y.append([0, 1])
|
||||
|
||||
if val_doc and val_answer:
|
||||
for key, document in val_doc.items():
|
||||
doc_sequence = dictionary.token_list_to_sequence(document)
|
||||
|
||||
# select wrong candidates (possibly, in same quantity as good answers)
|
||||
wrong_candidates = list(val_candidates[key])
|
||||
for answer in val_answer[key]:
|
||||
if answer in wrong_candidates:
|
||||
wrong_candidates.remove(answer)
|
||||
|
||||
while len(wrong_candidates) > len(val_answer[key]):
|
||||
random_candidate = random.choice(wrong_candidates)
|
||||
wrong_candidates.remove(random_candidate)
|
||||
|
||||
# append wrong candidates
|
||||
for kp in wrong_candidates:
|
||||
val_q_balanced.append(doc_sequence)
|
||||
val_a_balanced.append(dictionary.token_list_to_sequence(kp))
|
||||
val_y_balanced.append([1, 0])
|
||||
|
||||
# append true answers
|
||||
for kp in val_answer[key]:
|
||||
val_q_balanced.append(doc_sequence)
|
||||
val_a_balanced.append(dictionary.token_list_to_sequence(kp))
|
||||
val_y_balanced.append([0, 1])
|
||||
|
||||
# for the other sets, just pick the auto-generated candidates
|
||||
for key, document in test_doc.items():
|
||||
doc_sequence = dictionary.token_list_to_sequence(document)
|
||||
for kp in test_candidates[key]:
|
||||
test_q.append(doc_sequence)
|
||||
test_a.append(dictionary.token_list_to_sequence(kp))
|
||||
test_y.append([0, 1] if kp in test_answer[key] else [1, 0])
|
||||
|
||||
if val_doc and val_answer:
|
||||
for key, document in val_doc.items():
|
||||
doc_sequence = dictionary.token_list_to_sequence(document)
|
||||
for kp in val_candidates[key]:
|
||||
val_q.append(doc_sequence)
|
||||
val_a.append(dictionary.token_list_to_sequence(kp))
|
||||
val_y.append([0, 1] if kp in val_answer[key] else [1, 0])
|
||||
|
||||
logging.debug("Longest training document : %s tokens" % len(max(train_q, key=len)))
|
||||
logging.debug("Longest training answer : %s tokens" % len(max(train_a, key=len)))
|
||||
logging.debug("Longest test document : %s tokens" % len(max(test_q, key=len)))
|
||||
logging.debug("Longest test answer : %s tokens" % len(max(test_a, key=len)))
|
||||
if val_doc and val_answer:
|
||||
logging.debug("Longest validation document : %s tokens" % len(max(val_q, key=len)))
|
||||
logging.debug("Longest validation answer : %s tokens" % len(max(val_a, key=len)))
|
||||
logging.debug("Longest balanced validation document : %s tokens" % len(max(val_q, key=len)))
|
||||
logging.debug("Longest balanced validation answer : %s tokens" % len(max(val_a, key=len)))
|
||||
|
||||
train_q = np.asarray(pad_sequences(train_q, maxlen=max_document_length, padding='post', truncating='post'))
|
||||
train_a = np.asarray(pad_sequences(train_a, maxlen=max_answer_length, padding='post', truncating='post'))
|
||||
|
||||
test_q = np.asarray(pad_sequences(test_q, maxlen=max_document_length, padding='post', truncating='post'))
|
||||
test_a = np.asarray(pad_sequences(test_a, maxlen=max_answer_length, padding='post', truncating='post'))
|
||||
|
||||
if val_doc and val_answer:
|
||||
val_q = np.asarray(pad_sequences(val_q, maxlen=max_document_length, padding='post', truncating='post'))
|
||||
val_a = np.asarray(pad_sequences(val_a, maxlen=max_answer_length, padding='post', truncating='post'))
|
||||
val_q_balanced = np.asarray(pad_sequences(val_q_balanced, maxlen=max_document_length, padding='post', truncating='post'))
|
||||
val_a_balanced = np.asarray(pad_sequences(val_a_balanced, maxlen=max_answer_length, padding='post', truncating='post'))
|
||||
|
||||
logging.debug("Training set documents size : %s", np.shape(train_q))
|
||||
logging.debug("Training set answers size : %s", np.shape(train_a))
|
||||
logging.debug("Test set documents size : %s", np.shape(test_q))
|
||||
logging.debug("Test set answers size : %s ", np.shape(test_a))
|
||||
|
||||
if val_doc and val_answer:
|
||||
logging.debug("Validation set documents size : %s", np.shape(val_q))
|
||||
logging.debug("Validation set answers size : %s ", np.shape(val_a))
|
||||
logging.debug("Balanced Validation set documents size : %s", np.shape(val_q_balanced))
|
||||
logging.debug("Balanced Validation set answers size : %s ", np.shape(val_a_balanced))
|
||||
|
||||
# prepare the matrix for the embedding layer
|
||||
word_index = dictionary.word_index
|
||||
embeddings_index = glove.load_glove('', embeddings_size)
|
||||
|
||||
num_words = min(max_vocabulary_size, 1 + len(word_index))
|
||||
|
||||
logging.debug("Building embedding matrix of size [%s,%s]..." % (num_words, embeddings_size))
|
||||
|
||||
embedding_matrix = np.zeros((num_words, embeddings_size))
|
||||
for word, i in word_index.items():
|
||||
if i >= num_words:
|
||||
continue
|
||||
embedding_vector = embeddings_index.get(word)
|
||||
if embedding_vector is not None:
|
||||
# words not found in embedding index will be all-zeros.
|
||||
embedding_matrix[i] = embedding_vector
|
||||
|
||||
return [train_q, train_a], train_y, [test_q, test_a], test_y, [val_q, val_a], val_y, \
|
||||
[val_q_balanced, val_a_balanced], val_y_balanced, embedding_matrix, dictionary
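

# A minimal, standalone sketch of the candidate balancing used above: for each
# document we keep roughly as many wrong candidates as there are true keyphrases.
# The helper below is only an illustration and is not called elsewhere in this module.
def _balance_candidates(candidates, answers):
    # keep only candidates that are not true keyphrases
    wrong = [kp for kp in candidates if kp not in answers]
    # randomly drop wrong candidates until both classes have roughly the same size
    while len(wrong) > len(answers):
        wrong.remove(random.choice(wrong))
    return wrong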
def prepare_sequential(train_doc, train_answer, test_doc, test_answer, val_doc, val_answer,
                       max_document_length=35000,
                       max_vocabulary_size=5000,
                       embeddings_size=50,
                       stem_test=False):
    """
    Prepares a dataset for use by a sequential, categorical model.

    :param train_doc: the training documents
    :param train_answer: the KPs for the training documents
    :param test_doc: the test documents
    :param test_answer: the KPs for the test documents
    :param val_doc: the validation documents (can be None)
    :param val_answer: the KPs for the validation documents (can be None)
    :param max_document_length: the maximum length of the documents (longer documents will be truncated!)
    :param max_vocabulary_size: the maximum size of the vocabulary to use
           (i.e. we keep only the top max_vocabulary_size words)
    :param embeddings_size: the size of the GloVe embeddings to use
    :param stem_test: set the value to True if the test set answers are stemmed
    :return: a tuple (train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix) containing the training,
             test and validation set, and an embedding matrix for an Embedding layer
             (a usage sketch follows this function)
    """

    train_answer_seq = make_sequential(train_doc, train_answer)

    if not stem_test:
        test_answer_seq = make_sequential(test_doc, test_answer)
    else:
        import copy
        stemmed_test_doc = copy.deepcopy(test_doc)
        stemmed_test_doc = stem_dataset(stemmed_test_doc)
        test_answer_seq = make_sequential(stemmed_test_doc, test_answer)

    # Prepare validation return data
    val_x = None
    val_y = None

    if val_doc and val_answer:
        val_answer_seq = make_sequential(val_doc, val_answer)

    # Transform the documents to sequences
    documents_full = []
    train_y = []
    test_y = []

    if val_doc and val_answer:
        val_y = []

    for key, doc in train_doc.items():
        # store the tokens as a list so the dictionary can iterate them safely
        documents_full.append(list(doc))
        train_y.append(train_answer_seq[key])
    for key, doc in test_doc.items():
        documents_full.append(list(doc))
        test_y.append(test_answer_seq[key])

    if val_doc and val_answer:
        for key, doc in val_doc.items():
            documents_full.append(list(doc))
            val_y.append(val_answer_seq[key])

    logging.debug("Fitting dictionary on %s documents..." % len(documents_full))

    dictionary = dict.Dictionary(num_words=max_vocabulary_size)
    dictionary.fit_on_texts(documents_full)

    logging.debug("Dictionary fitting completed. Found %s unique tokens" % len(dictionary.word_index))

    # Now we can prepare the actual input
    train_x = dictionary.texts_to_sequences(train_doc.values())
    test_x = dictionary.texts_to_sequences(test_doc.values())
    if val_doc and val_answer:
        val_x = dictionary.texts_to_sequences(val_doc.values())

    logging.debug("Longest training document : %s tokens" % len(max(train_x, key=len)))
    logging.debug("Longest test document : %s tokens" % len(max(test_x, key=len)))
    if val_doc and val_answer:
        logging.debug("Longest validation document : %s tokens" % len(max(val_x, key=len)))

    train_x = np.asarray(pad_sequences(train_x, maxlen=max_document_length, padding='post', truncating='post'))
    train_y = pad_sequences(train_y, maxlen=max_document_length, padding='post', truncating='post')
    train_y = make_categorical(train_y)

    test_x = np.asarray(pad_sequences(test_x, maxlen=max_document_length, padding='post', truncating='post'))
    test_y = pad_sequences(test_y, maxlen=max_document_length, padding='post', truncating='post')
    test_y = make_categorical(test_y)

    if val_doc and val_answer:
        val_x = np.asarray(pad_sequences(val_x, maxlen=max_document_length, padding='post', truncating='post'))
        val_y = pad_sequences(val_y, maxlen=max_document_length, padding='post', truncating='post')
        val_y = make_categorical(val_y)

    logging.debug("Training set samples size : %s", np.shape(train_x))
    logging.debug("Training set answers size : %s", np.shape(train_y))
    logging.debug("Test set samples size : %s", np.shape(test_x))
    logging.debug("Test set answers size : %s", np.shape(test_y))

    if val_doc and val_answer:
        logging.debug("Validation set samples size : %s", np.shape(val_x))
        logging.debug("Validation set answers size : %s", np.shape(val_y))

    # prepare the matrix for the embedding layer
    word_index = dictionary.word_index
    embeddings_index = glove.load_glove('', embeddings_size)

    num_words = min(max_vocabulary_size, 1 + len(word_index))

    logging.debug("Building embedding matrix of size [%s,%s]..." % (num_words, embeddings_size))

    embedding_matrix = np.zeros((num_words, embeddings_size))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in the embedding index stay all-zeros
            embedding_matrix[i] = embedding_vector

    return train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix
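

# A hedged usage sketch for prepare_sequential on toy, pre-tokenized data. The
# document keys and tokens below are made up, and running it assumes the GloVe
# vectors that glove.load_glove() reads are available locally.
def _example_prepare_sequential():
    toy_train_doc = {"doc1": ["solar", "power", "output", "rose", "today"]}
    toy_train_ans = {"doc1": [["solar", "power"]]}
    toy_test_doc = {"doc2": ["wind", "power", "output", "fell"]}
    toy_test_ans = {"doc2": [["wind", "power"]]}

    train_x, train_y, test_x, test_y, _, _, emb = prepare_sequential(
        toy_train_doc, toy_train_ans, toy_test_doc, toy_test_ans, None, None,
        max_document_length=10, max_vocabulary_size=100, embeddings_size=50)

    # train_x: (1, 10) padded token ids; train_y: (1, 10, 3) one-hot 0/1/2 tags;
    # emb: (vocabulary_size, 50) matrix for a frozen Embedding layer
    print(np.shape(train_x), np.shape(train_y), np.shape(emb))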
def make_sequential(documents, answers):
    """
    Transform an answer-based dataset (i.e. with a list of
    documents and a list of keyphrases) into a sequential, NER-like
    dataset, i.e. where the answer set for each document is the list
    of the document's tokens, each marked as non-keyphrase (0),
    beginning of keyphrase (1) or inside-keyphrase (2).

    For example, for the tokens

    "I am a python developer since today."

    if the keyphrases are "python developer" and "today", the answer
    set for these tokens is

    "[0 0 0 1 2 0 1]"

    (see the usage sketch after this function).

    :param documents: the list of documents
    :param answers: the list of keyphrases
    :return: the new answer set
    """

    seq_answers = {}

    for key, document in documents.items():
        doc_answers_set = answers[key]

        # Sort the keyphrases by length. We process the shorter KPs first,
        # so that if they are contained in a longer KP, the longer one will
        # simply overwrite the shorter one.
        doc_answers_set.sort(key=lambda a: len(a))

        # This list will contain the answer. We initialize it with zeros
        # and fill in the 1s and 2s afterwards.
        doc_answers_seq = [0] * len(document)

        for answer in doc_answers_set:
            # find the positions where the first word of the KP appears
            appearances = [i for i, word in enumerate(document) if word == answer[0]]
            for idx in appearances:
                is_kp = True
                # check that the KP also matches from its second word onwards
                for i in range(1, len(answer)):
                    if (i + idx) < len(document):
                        is_kp = is_kp and answer[i] == document[i + idx]
                    else:
                        # end of the document
                        is_kp = False

                # if we found an actual KP, mark its tokens in the output list
                if is_kp:
                    doc_answers_seq[idx] = 1
                    for i in range(1, len(answer)):
                        doc_answers_seq[idx + i] = 2

        seq_answers[key] = doc_answers_seq

    return seq_answers
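

# A minimal sketch of make_sequential on the docstring's example, assuming the
# documents and answers are dicts of pre-tokenized texts keyed by document id.
def _example_make_sequential():
    docs = {"d1": ["I", "am", "a", "python", "developer", "since", "today"]}
    kps = {"d1": [["python", "developer"], ["today"]]}
    # expected output: {"d1": [0, 0, 0, 1, 2, 0, 1]}
    return make_sequential(docs, kps)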
def make_categorical(x):
    """
    Transform a two-dimensional list into a 3-dimensional array. The 2nd
    dimension of the input list becomes a one-hot 2D array, e.g.
    if the input is [[1,2,0],...], the output will be
    [[[0,1,0],[0,0,1],[1,0,0]],...]
    (see the sketch after this function for a worked example).

    :param x: a 2D-list
    :return: a 3D-numpy array
    """

    # number of categories
    num_categories = max([item for sublist in x for item in sublist]) + 1

    # output array in numpy format
    new_x = np.zeros((len(x), len(x[0]), num_categories))

    # let keras do the actual categorical conversion
    for i, doc in enumerate(x):
        new_x[i] = np_utils.to_categorical(doc, num_classes=num_categories)

    return new_x
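

# A minimal sketch of make_categorical on the docstring's toy input: each row of
# class ids becomes a row of one-hot vectors.
def _example_make_categorical():
    y = [[1, 2, 0], [0, 1, 2]]
    one_hot = make_categorical(y)
    # one_hot[0] == [[0, 1, 0], [0, 0, 1], [1, 0, 0]]
    # one_hot.shape == (2, 3, 3)
    return one_hot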
def stem_dataset(dataset):
    """
    Stem every token of every document in the dataset with the Porter stemmer.

    :param dataset: a dictionary mapping document ids to lists of tokens
    :return: the same dictionary, with the tokens stemmed in place
    """

    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()

    for key, tokens in dataset.items():
        stemmed_tokens = [stemmer.stem(token) for token in tokens]
        dataset[key] = stemmed_tokens

    return dataset