first commit

zhangxiaojun 2023-04-19 13:16:27 +08:00
parent 850c1fc3f4
commit 7e701970c7
15 changed files with 75814 additions and 0 deletions

173
Bi-LSTM.py Normal file

@ -0,0 +1,173 @@
import os,sys
os.chdir(sys.path[0])
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from datasets import EnNews
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
import numpy as np
import random as rn
np.random.seed(421)
rn.seed(12345)
import logging
from keras import regularizers
from keras.layers import Bidirectional, Dense, Dropout, Embedding, LSTM, TimeDistributed
from keras.models import Sequential, load_model
from datasets import *
from eval import keras_metrics, metrics
from nlp import tokenizer as tk
from utils import info, preprocessing, postprocessing, plots
# Logging configuration
logging.basicConfig(
format='%(asctime)s\t%(levelname)s\t%(message)s',
level=logging.DEBUG)
info.log_versions()
# Global variables
SAVE_MODEL = False
MODEL_PATH = "models/bilstm.h5"
SHOW_PLOTS = False
# Dataset and hyperparameters
Dataset = EnNews
rootpath = "/home/zhangxj/WorkFile/本科毕业设计"
tokenizer = tk.tokenizers.nltk
DATASET_FOLDER = rootpath+"/EnergyNews"
MAX_DOCUMENT_LENGTH = 400
MAX_VOCABULARY_SIZE = 20000
EMBEDDINGS_SIZE = 50
batch_size = 32
epochs = 20
KP_WEIGHT = 10
STEM_MODE = metrics.stemMode.both
STEM_TEST = False
# Load the dataset
logging.info("Loading dataset...")
data = Dataset(DATASET_FOLDER)
train_doc_str, train_answer_str = data.load_train()
test_doc_str, test_answer_str = data.load_test()
val_doc_str, val_answer_str = data.load_validation()
train_doc, train_answer = tk.tokenize_set(train_doc_str, train_answer_str, tokenizer)
test_doc, test_answer = tk.tokenize_set(test_doc_str, test_answer_str, tokenizer)
val_doc, val_answer = tk.tokenize_set(val_doc_str, val_answer_str, tokenizer)
# Sanity check
logging.info("Dataset loaded. Preprocessing data...")
train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix = preprocessing. \
prepare_sequential(train_doc, train_answer, test_doc, test_answer, val_doc, val_answer,
max_document_length=MAX_DOCUMENT_LENGTH,
max_vocabulary_size=MAX_VOCABULARY_SIZE,
embeddings_size=EMBEDDINGS_SIZE,
stem_test=STEM_TEST)
# Weight the training samples: everything that is not a KP gets down-weighted
from sklearn.utils import class_weight
train_y_weights = np.argmax(train_y, axis=2)
train_y_weights = np.reshape(class_weight.compute_sample_weight('balanced', train_y_weights.flatten()),
np.shape(train_y_weights))
logging.info("数据预处理完成")
logging.info("可能的最大召回率: %s",
metrics.recall(test_answer,
postprocessing.get_words(test_doc, postprocessing.undo_sequential(test_y)),
STEM_MODE))
if not SAVE_MODEL or not os.path.isfile(MODEL_PATH):
logging.debug("建立网络...")
model = Sequential()
print("-------",np.shape(embedding_matrix)[0])
embedding_layer = Embedding(np.shape(embedding_matrix)[0],
EMBEDDINGS_SIZE,
weights=[embedding_matrix],
input_length=MAX_DOCUMENT_LENGTH,
trainable=False)
model.add(embedding_layer)
model.add(Bidirectional(LSTM(300, activation='tanh', recurrent_activation='hard_sigmoid', return_sequences=True)))
model.add(Dropout(0.25))
model.add(TimeDistributed(Dense(150, activation='relu', kernel_regularizer=regularizers.l2(0.01))))
model.add(Dropout(0.25))
model.add(TimeDistributed(Dense(2, activation='softmax')))
logging.info("编译网络...")
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'],
sample_weight_mode="temporal")
print(model.summary())
metrics_callback = keras_metrics.MetricsCallback(val_x, val_y)
logging.info("拟合网络...")
history = model.fit(train_x, train_y,
validation_data=(val_x, val_y),
epochs=epochs,
batch_size=batch_size,
sample_weight=train_y_weights,
callbacks=[metrics_callback])
if SHOW_PLOTS:
plots.plot_accuracy(history)
plots.plot_loss(history)
plots.plot_prf(metrics_callback)
if SAVE_MODEL:
model.save(MODEL_PATH)
logging.info("模型保存路径 in %s", MODEL_PATH)
else:
logging.info("加载模型 %s...", MODEL_PATH)
model = load_model(MODEL_PATH)
logging.info("加载模型完成")
logging.info("在测试集上预测...")
output = model.predict(x=test_x, verbose=1)
logging.debug("输出格式: %s", np.shape(output))
obtained_tokens = postprocessing.undo_sequential(output)
obtained_words = postprocessing.get_words(test_doc, obtained_tokens)
precision = metrics.precision(test_answer, obtained_words,STEM_MODE)
recall = metrics.recall(test_answer, obtained_words,STEM_MODE)
f1 = metrics.f1(precision, recall)
print("### 获得的分数 ###")
print("###")
print("### Precision : %.4f" % precision)
print("### Recall : %.4f" % recall)
print("### F1 : %.4f" % f1)
print("### ###")
keras_precision = keras_metrics.keras_precision(test_y, output)
keras_recall = keras_metrics.keras_recall(test_y, output)
keras_f1 = keras_metrics.keras_f1(test_y, output)
print("### 获得的分数 ###")
print("###")
print("### Precision : %.4f" % keras_precision)
print("### Recall : %.4f" % keras_recall)
print("### F1 : %.4f" % keras_f1)
print("### ###")

209
datasets.py Normal file

@ -0,0 +1,209 @@
import logging
import os
from nlp import tokenizer as tk
class Dataset(object):
"""
An abstract class that represents a dataset.
"""
def __init__(self, name, path):
self.path = path
self.name = name
self.test_documents = None
self.test_answers = None
self.train_documents = None
self.train_answers = None
self.validation_documents = None
self.validation_answers = None
logging.debug("初始化数据集 %s 文件夹路径 %s" %
(self.name, self.path))
def __str__(self):
        return 'Dataset %s located at %s' % (self.name, self.path)
def _load_test_documents(self):
"""
Loads the test documents.
:return: a list of documents.
"""
raise NotImplementedError
def _load_test_answers(self):
"""
Loads the answers for the test documents.
:return: a list of answers.
"""
raise NotImplementedError
def _load_train_documents(self):
"""
Loads the train documents.
:return: a list of documents.
"""
raise NotImplementedError
def _load_train_answers(self):
"""
Loads the answers for the train documents.
:return: a list of answers.
"""
raise NotImplementedError
def _load_validation_documents(self):
"""
Loads the validation documents.
:return: a list of documents.
"""
raise NotImplementedError
def _load_validation_answers(self):
"""
Loads the answers for the validation documents.
:return: a list of answers.
"""
raise NotImplementedError
def load_test(self):
"""
Loads the test documents and their answers.
:return: a tuple containing the test documents and the test answers.
"""
if not self.test_documents:
self.test_documents = self._load_test_documents()
if not self.test_answers:
self.test_answers = self._load_test_answers()
        assert (len(self.test_documents) == len(self.test_answers)), \
            "You don't have enough (or have too many) test answers for your documents!"
        logging.debug("Loaded test set for dataset %s" % self.name)
return self.test_documents, self.test_answers
def load_train(self):
"""
Loads the training documents and their answers.
:return: a tuple containing the train documents and the training answers.
"""
if not self.train_documents:
self.train_documents = self._load_train_documents()
if not self.train_answers:
self.train_answers = self._load_train_answers()
        assert (len(self.train_documents) == len(self.train_answers)), \
            "You don't have enough (or have too many) train answers for your documents!"
        logging.debug("Loaded training set for dataset %s" % self.name)
return self.train_documents, self.train_answers
def load_validation(self):
"""
Loads the validation documents and their answers.
        :return: a tuple containing the validation documents and the validation answers.
"""
if not self.validation_documents:
self.validation_documents = self._load_validation_documents()
if not self.validation_answers:
self.validation_answers = self._load_validation_answers()
        assert (not self.validation_documents and not self.validation_answers) or \
               (len(self.validation_documents) == len(self.validation_answers)), \
            "You don't have enough (or have too many) validation answers for your documents!"
        logging.debug("Loaded validation set for dataset %s" % self.name)
return self.validation_documents, self.validation_answers
class EnNews(Dataset):
"""
Dataset from Annette Hulth's "Improved Automatic Keyword Extraction
Given More Linguistic Knowledge"
Note: to make the results obtained with this dataset comparable to
the ones described in Hulth's paper, only the "uncontrolled" terms
are used.
Full-text here: http://www.aclweb.org/anthology/W03-1028
"""
def __init__(self, path):
super().__init__("EnergyNews", path)
def __load_documents(self, folder):
"""
        Loads the documents in the .clr files contained
in the specified folder and puts them in a dictionary
indexed by document id (i.e. the filename without the
extension).
:param folder: the folder containing the documents
:return: a dictionary with the documents
"""
# This dictionary will contain the documents
documents = {}
for doc in os.listdir("%s/%s" % (self.path, folder)):
if doc.endswith(".clr"):
content = open(("%s/%s/%s" % (self.path, folder, doc)), "r").read()
documents[doc[:doc.find('.')]] = content
return documents
def __load_answers(self, folder):
"""
        Loads the answers contained in the .key files
and puts them in a dictionary indexed by document ID
(i.e. the document name without the extension)
:param folder: the folder containing the answer files
:return: a dictionary with the answers
"""
# This dictionary will contain the answers
answers = {}
for doc in os.listdir("%s/%s" % (self.path, folder)):
if doc.endswith(".key"):
content = open(("%s/%s/%s" % (self.path, folder, doc)), "r").read()
retrieved_answers = content.split(' ')
doc_id = doc[:doc.find('.')]
for answer in retrieved_answers:
                    answer = answer.strip()  # strip leading/trailing whitespace
if doc_id not in answers:
answers[doc_id] = [answer]
else:
answers[doc_id].append(answer)
return answers
def _load_test_documents(self):
return self.__load_documents("test")
def _load_train_documents(self):
return self.__load_documents("train")
def _load_validation_documents(self):
return self.__load_documents("validation")
def _load_test_answers(self):
return self.__load_answers("test")
def _load_train_answers(self):
return self.__load_answers("train")
def _load_validation_answers(self):
return self.__load_answers("validation")

31
eval/anno_generator.py Normal file

@ -0,0 +1,31 @@
import os
def write_anno(output_folder, documents, keyphrases):
    # create the output directory if it does not exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    for doc_id, doc_string in documents.items():
        i = 0
        output_file = open("%s/%s.%s" % (output_folder, doc_id, "ann"), "w")
        for kp in keyphrases[doc_id]:
            kp_string = ' '.join(kp)
            for start_index in list(find_all(doc_string, kp_string)):
                end_index = start_index + len(kp_string)
                output_file.write("T%s\t%s %s %s\t%s\n" %
                                  (i, "NO_TYPE", start_index, end_index, kp_string))
                i += 1  # brat annotation IDs must be unique within a file (T0, T1, ...)
        output_file.close()
def find_all(target_string, substring):
start = 0
while True:
start = target_string.find(substring, start)
if start == -1: return
yield start
start += 1
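
Usage sketch with toy inputs; "out_ann" is a placeholder folder name, documents map ids to raw strings, and keyphrases map ids to lists of token lists:

from eval import anno_generator

docs = {"doc1": "solar power plants reduce carbon emissions"}
kps = {"doc1": [["solar", "power"], ["carbon", "emissions"]]}
anno_generator.write_anno("out_ann", docs, kps)  # writes out_ann/doc1.ann in brat standoff format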

220
eval/keras_metrics.py Normal file

@ -0,0 +1,220 @@
import keras
import numpy as np
import logging
class MetricsCallback(keras.callbacks.Callback):
def __init__(self,val_x,val_y):
self.val_x = val_x
self.val_y = val_y
self.epoch = []
self.history = {}
def on_epoch_end(self, epoch, logs={}):
# Predict on the validation data
y_pred = self.model.predict(self.val_x)
precision = keras_precision(self.val_y,y_pred)
recall = keras_recall(self.val_y, y_pred)
f1 = keras_f1(self.val_y, y_pred)
print("")
print("### Validation Scores ###")
print("###")
print("### Epoch : %s" % (epoch+1))
print("### Precision : %.4f" % precision)
print("### Recall : %.4f" % recall)
print("### F1 : %.4f" % f1)
print("### ###")
self.epoch.append(epoch+1)
self.history.setdefault("precision", []).append(precision)
self.history.setdefault("recall", []).append(recall)
self.history.setdefault("f1", []).append(f1)
class MetricsCallbackQA(keras.callbacks.Callback):
def __init__(self,val_x,val_y,batch_size = 128):
self.val_x = val_x
self.val_y = val_y
self.epoch = []
self.history = {}
self.batch_size = batch_size
def on_epoch_end(self, epoch, logs={}):
# Predict on the validation data
y_pred = self.model.predict(self.val_x,batch_size=self.batch_size,verbose=1)
precision = keras_precision_qa(self.val_y,y_pred)
recall = keras_recall_qa(self.val_y, y_pred)
f1 = keras_f1_qa(self.val_y, y_pred)
print("")
print("### Validation Scores ###")
print("###")
print("### Epoch : %s" % (epoch+1))
print("### Precision : %.4f" % precision)
print("### Recall : %.4f" % recall)
print("### F1 : %.4f" % f1)
print("### ###")
self.epoch.append(epoch+1)
self.history.setdefault("precision", []).append(precision)
self.history.setdefault("recall", []).append(recall)
self.history.setdefault("f1", []).append(f1)
def keras_precision(y_true,y_pred) :
true_positives = 0
false_positives = 0
# reduce dimensionality
y_true_2d = np.argmax(y_true,axis=2)
y_pred_2d = np.argmax(y_pred,axis=2)
y_true_indices = {}
for i in range(np.shape(y_true_2d)[0]):
doc_true_indices = []
in_word = False
for j in range(np.shape(y_true_2d)[1]):
if y_true_2d[i][j] == 1 :
doc_true_indices.append(["%s" % j])
in_word = True
elif j > 0 and y_true_2d[i][j] == 2 and in_word:
doc_true_indices[len(doc_true_indices) -1].append(",%s" % j)
else:
in_word = False
y_true_indices[i] = doc_true_indices
y_pred_indices = {}
for i in range(np.shape(y_pred_2d)[0]):
doc_true_indices = []
in_word = False
for j in range(np.shape(y_pred_2d)[1]):
if y_pred_2d[i][j] == 1:
doc_true_indices.append(["%s" % j])
in_word = True
elif j > 0 and y_pred_2d[i][j] == 2 and in_word:
doc_true_indices[len(doc_true_indices) - 1].append(",%s" % j)
else :
in_word = False
y_pred_indices[i] = doc_true_indices
for i in range(len(y_pred_indices)) :
for kp in y_pred_indices[i]:
if kp in y_true_indices[i]:
true_positives += 1
else :
false_positives += 1
return (1.0 * true_positives) / (true_positives + false_positives) \
if true_positives + false_positives > 0 else 0
def keras_recall(y_true,y_pred) :
true_positives = 0
false_positives = 0
# reduce dimensionality
y_true_2d = np.argmax(y_true,axis=2)
y_pred_2d = np.argmax(y_pred,axis=2)
y_true_indices = {}
for i in range(np.shape(y_true_2d)[0]):
doc_true_indices = []
in_word = False
for j in range(np.shape(y_true_2d)[1]):
if y_true_2d[i][j] == 1 :
doc_true_indices.append(["%s" % j])
in_word = True
elif j > 0 and y_true_2d[i][j] == 2 and in_word:
doc_true_indices[len(doc_true_indices) -1].append(",%s" % j)
else:
in_word = False
y_true_indices[i] = doc_true_indices
y_pred_indices = {}
for i in range(np.shape(y_pred_2d)[0]):
doc_true_indices = []
in_word = False
for j in range(np.shape(y_pred_2d)[1]):
if y_pred_2d[i][j] == 1:
doc_true_indices.append(["%s" % j])
in_word = True
elif j > 0 and y_pred_2d[i][j] == 2 and in_word:
doc_true_indices[len(doc_true_indices) - 1].append(",%s" % j)
else :
in_word = False
y_pred_indices[i] = doc_true_indices
for i in range(len(y_pred_indices)) :
for kp in y_pred_indices[i]:
if kp in y_true_indices[i]:
true_positives += 1
return (1.0 * true_positives) / sum(len(kps) for doc,kps in y_true_indices.items())
def keras_f1(y_true,y_pred):
p = keras_precision(y_true,y_pred)
r = keras_recall(y_true,y_pred)
return (2*(p * r)) / (p + r) if p != 0 and r != 0 else 0
def keras_precision_qa(y_true,y_pred) :
# Prepare data
if np.shape(y_pred)[1] == 2:
# If one-hot prediction...
y_true = np.argmax(y_true,axis=1)
y_pred = np.argmax(y_pred,axis=1)
else:
# If similarity-based...
y_pred = np.reshape(y_pred, np.shape(y_true))
y_pred = np.round(y_pred)
den = np.count_nonzero(y_pred)
if den == 0:
logging.log(logging.WARNING,"Network did not predict any positive sample")
return 0
return np.count_nonzero(np.in1d(np.where(y_pred), np.where(y_true))) / den
def keras_recall_qa(y_true,y_pred) :
# Prepare data
if np.shape(y_pred)[1] == 2:
# If one-hot prediction...
y_true = np.argmax(y_true, axis=1)
y_pred = np.argmax(y_pred, axis=1)
else:
# If similarity-based...
y_pred = np.reshape(y_pred, np.shape(y_true))
y_pred = np.round(y_pred)
return np.count_nonzero(np.in1d(np.where(y_true), np.where(y_pred))) / np.count_nonzero(y_true)
def keras_f1_qa(y_true,y_pred):
p = keras_precision_qa(y_true,y_pred)
r = keras_recall_qa(y_true,y_pred)
return (2*(p * r)) / (p + r) if p + r > 0 else 0
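
A small sanity check of the sequence-level metrics above on a toy one-hot batch (1 document, 4 tokens, 3 classes: 0 = outside, 1 = keyphrase start, 2 = keyphrase continuation):

import numpy as np
from eval import keras_metrics

y_true = np.array([[[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]]])  # gold KP spans tokens 1-2
y_pred = np.array([[[0.9, 0.1, 0.0], [0.2, 0.7, 0.1], [0.1, 0.1, 0.8], [0.8, 0.1, 0.1]]])

p = keras_metrics.keras_precision(y_true, y_pred)  # 1.0: the predicted KP matches the gold one
r = keras_metrics.keras_recall(y_true, y_pred)     # 1.0: the gold KP is recovered
print(p, r, keras_metrics.keras_f1(y_true, y_pred))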

82
eval/metrics.py Normal file

@ -0,0 +1,82 @@
from enum import Enum
from nltk.stem import *
stemMode = Enum("StemmerMode","none both results")
def precision(reference,obtained,stem = stemMode.none):
true_positives = 0
false_positives = 0
for doc, reference_kps_tokens in reference.items():
obtained_kps_tokens = obtained[doc]
reference_kps = []
obtained_kps = []
for ref_tokens in reference_kps_tokens:
if stem == stemMode.both:
stemmer = PorterStemmer()
ref_tokens = [stemmer.stem(token) for token in ref_tokens]
reference_kp = ' '.join(ref_tokens)
reference_kps.append(reference_kp.lower())
for obt_tokens in obtained_kps_tokens:
if stem == stemMode.both or stem == stemMode.results:
stemmer = PorterStemmer()
obt_tokens = [stemmer.stem(token) for token in obt_tokens]
obt_string = ' '.join(obt_tokens).lower()
if obt_string not in obtained_kps:
# this is necessary, because if we stem the kps we may
# obtain duplicates
obtained_kps.append(obt_string)
for obt_string in obtained_kps:
if obt_string in reference_kps:
true_positives += 1
else:
false_positives += 1
return (true_positives * 1.0) / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
def recall(reference,obtained,stem=stemMode.none):
true_positives = 0
total_reference = sum(len(kps) for doc,kps in reference.items())
for doc, reference_kps_tokens in reference.items():
obtained_kps_tokens = obtained[doc]
reference_kps = []
for ref_tokens in reference_kps_tokens:
if stem == stemMode.both:
stemmer = PorterStemmer()
ref_tokens = [stemmer.stem(token) for token in ref_tokens]
reference_kp = ' '.join(ref_tokens)
reference_kps.append(reference_kp)
for obt_tokens in obtained_kps_tokens:
if stem == stemMode.both or stem == stemMode.results:
stemmer = PorterStemmer()
obt_tokens = [stemmer.stem(token) for token in obt_tokens]
obt_string = ' '.join(obt_tokens)
if obt_string in reference_kps:
true_positives += 1
reference_kps.remove(obt_string)
return (true_positives * 1.0) / total_reference
def f1(precision, recall):
return (2 * (precision * recall)) / (precision + recall) if precision + recall > 0 else 0
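
A small worked example of the document-level scores above, on toy tokenized keyphrases:

from eval import metrics

reference = {"doc1": [["solar", "power"], ["emissions"]]}          # gold keyphrases
obtained = {"doc1": [["solar", "power"], ["wind", "turbines"]]}    # system output

p = metrics.precision(reference, obtained, metrics.stemMode.none)  # 1 of 2 obtained KPs is correct -> 0.5
r = metrics.recall(reference, obtained, metrics.stemMode.none)     # 1 of 2 gold KPs is found -> 0.5
print(p, r, metrics.f1(p, r))                                      # 0.5 0.5 0.5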

73939
glove/vectors.txt Normal file

File diff suppressed because it is too large

95
nlp/chunker.py Normal file

@ -0,0 +1,95 @@
import nltk
from nltk.chunk.regexp import *
from nlp import tokenizer as tk
KP_REGEX_1 = "<JJ|NN|NNP|NNS|NNPS>*<NN|NNP|NNS|NNPS|VB|VBG>"
KP_REGEX_2 = "<JJ>?<NN|NNS>+<IN><NN|NNS>"
KP_REGEX_3 = "<JJ|VBN>*<NN|NNS>"
noun_phrase_grammar = r"""
NBAR:
{<NN.*|JJ>*<NN.*|VBG>} # Nouns and Adjectives, terminated with Nouns or -ing verbs
KP:
{<NBAR>}
{<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
"""
hulth_grammar = r"""
NBAR:
{<NN.*|JJ.*>*<NN.*|VBG>} # Nouns and Adjectives, terminated with Nouns or -ing verbs
VBPART:
{<VBG|VBP><NBAR>} # Verb in participle from, then nouns
COUNT:
{<CD><NBAR>} # Numbers then nouns
NP:
{<NBAR><IN><NBAR>}
"""
hulth_labels = ['NP','NBAR','COUNT','VBPART']
def extract_candidates_from_set(set,tokenizer):
"""
Generates the candidate keyphrases for a document.
:param set: the training, test or validation set
:param tokenizer: which tokenizer to use
:return: a dictionary where each document is associated with its candidate keyphrases
"""
candidates = {}
for doc, str in set.items() :
candidates[doc] = extract_candidates(str,tokenizer)
return candidates
def extract_candidates(document,tokenizer):
"""
Extracts the candidate keyphrases from a string.
:param document: the string to analyze
:param tokenizer: the tokenizer to use
:return: the list of candidate keyphrases for the input document
"""
return extract_valid_tokens(tk.tokenize(document,tokenizer))
def extract_valid_tokens(tokens):
"""
Given a list of tokens, returns the subsets of such list which are potential keyphrases according to
the provided part-of-speech patterns.
    :param tokens: the token list to analyze
:return: the list of candidate keyphrases for the input document
"""
postagged_doc = nltk.pos_tag(tokens)
kp_rule_1 = ChunkRule(KP_REGEX_1,"")
kp_rule_2 = ChunkRule(KP_REGEX_2, "")
kp_rule_3 = ChunkRule(KP_REGEX_3, "")
#chunk_parser = RegexpChunkParser([kp_rule_1, kp_rule_2, kp_rule_3],
# chunk_label="KP")
chunk_parser = RegexpParser(grammar=hulth_grammar)
tree = chunk_parser.parse(postagged_doc)
candidates = []
for subtree in tree.subtrees():
if subtree.label() in hulth_labels:
candidate = []
for leaf in subtree.leaves():
candidate.append(leaf[0])
if candidate not in candidates:
candidates.append(candidate)
return candidates
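
Usage sketch on a toy token list; it assumes the NLTK POS tagger models are installed, and the exact candidates depend on the tagger output:

from nlp import chunker

tokens = ["renewable", "energy", "production", "is", "rising", "in", "china"]
print(chunker.extract_valid_tokens(tokens))
# prints noun-phrase-style candidates, e.g. [['renewable', 'energy', 'production'], ['china'], ...]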

38
nlp/cleaner.py Normal file

@ -0,0 +1,38 @@
import nltk
# NLTK uses the Penn Treebank tag set
# See http://www.comp.leeds.ac.uk/amalgam/tagsets/upenn.html
ALLOWED_TAGS_HEAD = ["NN","NNP","NNPS","NNS","VBN","VBG","JJ","JJR","JJS","RB","CD"]
ALLOWED_TAGS_TAIL = ["NN","NNP","NNPS","NNS","VBG","CD",")"]
def clean_tokens(keyphrase):
"""
    Removes the tokens from the head and the tail of a keyphrase
(passed as a token list) that do not match the allowed PoS tags.
:return: the cleaned keyphrase
"""
keyphrase_pos = nltk.pos_tag(keyphrase)
start = 0
for start in range(len(keyphrase_pos)):
if not keyphrase_pos[start][1] in ALLOWED_TAGS_HEAD:
start += 1
else:
break
end = len(keyphrase) - 1
for end in range(len(keyphrase_pos) - 1,start,-1):
if not keyphrase_pos[end][1] in ALLOWED_TAGS_TAIL:
end -= 1
else:
break
return keyphrase[start:end+1]
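
Usage sketch (again assuming the NLTK POS tagger models are installed; the result depends on the tagging):

from nlp import cleaner

print(cleaner.clean_tokens(["the", "solar", "panel", "of"]))
# e.g. ['solar', 'panel'] -- the leading determiner and trailing preposition are stripped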

101
nlp/dictionary.py Normal file

@ -0,0 +1,101 @@
from collections import OrderedDict
class Dictionary(object):
"""Dictionary utility class. This class is a lightweight version of the Keras text preprocessing module
(see https://github.com/fchollet/keras/blob/master/keras/preprocessing/text.py), designed to work on
tokens instead of strings.
This class is used to build a dictionary that can in turn be used to fill an Embedding layer
with word embeddings.
Please note that `0` is a reserved index that won't be assigned to any word.
The original keras.preprocessing.text module is licensed under the MIT license.
"""
def __init__(self, num_words=None):
self.word_counts = OrderedDict()
self.word_index = {}
self.reverse_word_index = None
self.num_words = num_words
self.document_count = 0
def fit_on_texts(self, tokenized_documents):
for document in tokenized_documents:
self.document_count += 1
for w in document:
if w in self.word_counts:
self.word_counts[w] += 1
else:
self.word_counts[w] = 1
wcounts = list(self.word_counts.items())
wcounts.sort(key=lambda x: x[1], reverse=True)
sorted_voc = [wc[0] for wc in wcounts]
# note that index 0 is reserved, never assigned to an existing word
self.word_index = dict(list(zip(sorted_voc, list(range(1, len(sorted_voc) + 1)))))
def texts_to_sequences(self, texts):
"""
        Transforms each text in texts into a sequence of integers.
        Only the top "num_words" most frequent words will be taken into account.
        :param texts: a list of token lists
:return: A list of sequences.
"""
texts_sequences = []
for text in texts:
texts_sequences.append(self.token_list_to_sequence(text))
return texts_sequences
def token_list_to_sequence(self, tokens):
"""Transforms each text in texts in a sequence of integers.
Only top "num_words" most frequent words will be taken into account.
Only words known by the tokenizer will be taken into account.
# Arguments
tokens: A list of texts (strings).
# Yields
Yields individual sequences.
"""
vect = []
for w in tokens:
i = self.word_index.get(w)
if i is not None:
if self.num_words and i >= self.num_words:
continue
else:
vect.append(i)
return vect
def tokens_to_words(self, tokens):
"""
Utility that prints the words associated to the provided indices.
:param tokens: a list of integers
"""
if not self.reverse_word_index:
self.build_reverse_word_index()
words = []
for token in tokens:
if token != 0:
words.append(self.reverse_word_index[token])
return words
def build_reverse_word_index(self):
self.reverse_word_index = {}
for key, value in self.word_index.items():
self.reverse_word_index[value] = key
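
A minimal round trip through the Dictionary on toy tokenized documents (indices of equally frequent words may differ):

from nlp.dictionary import Dictionary

docs = [["wind", "power", "is", "clean"], ["wind", "turbines", "generate", "power"]]
d = Dictionary(num_words=100)
d.fit_on_texts(docs)
seqs = d.texts_to_sequences(docs)     # e.g. [[1, 2, 3, 4], [1, 5, 6, 2]]
print(seqs)
print(d.tokens_to_words(seqs[0]))     # back to ['wind', 'power', 'is', 'clean']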

37
nlp/tokenizer.py Normal file

@ -0,0 +1,37 @@
from enum import Enum
import keras.preprocessing.text
import nltk
tokenizers = Enum("Tokenizers","nltk keras")
def tokenize_set(documents,answers,tokenizer):
    tokenized_docs = {}
    for doc, doc_string in documents.items():
        tokenized_docs[doc] = tokenize(doc_string, tokenizer)
    tokenized_answers = {}
    for doc, doc_answers in answers.items():
        for answer in doc_answers:
            if doc not in tokenized_answers:
                tokenized_answers[doc] = [tokenize(answer, tokenizer)]
            else:
                tokenized_answers[doc].append(tokenize(answer, tokenizer))
    return tokenized_docs, tokenized_answers
def tokenize(string,tokenizer = tokenizers.keras):
"""
Tokenizes a string using the selected tokenizer.
:param string: the string to tokenize
:param tokenizer: which tokenizer to use (nltk or keras)
:return: the list of tokens
"""
if tokenizer == tokenizers.nltk:
return nltk.word_tokenize(string.lower())
elif tokenizer == tokenizers.keras:
return keras.preprocessing.text.text_to_word_sequence(string)
else:
raise NotImplementedError()
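
Usage sketch for the two supported backends; the NLTK branch assumes the 'punkt' models are installed:

from nlp import tokenizer as tk

print(tk.tokenize("Solar power is growing fast.", tk.tokenizers.nltk))
# e.g. ['solar', 'power', 'is', 'growing', 'fast', '.']
print(tk.tokenize("Solar power is growing fast.", tk.tokenizers.keras))
# e.g. ['solar', 'power', 'is', 'growing', 'fast']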

25
utils/glove.py Normal file

@ -0,0 +1,25 @@
import os,sys
os.chdir(sys.path[0])  # resolve relative paths from the script's directory
import numpy as np
import logging
# Load the pre-trained GloVe word vectors
def load_glove(glove_dir,size):
    embeddings_index = {}
    # NOTE: the vectors path below is absolute, so glove_dir is effectively ignored and size is unused
    glove_path = ("/home/zhangxj/WorkFile/本科毕业设计/glove/vectors.txt")
logging.debug("Loading GloVe pre-trained embeddings from %s" % glove_path)
f = open(os.path.join(glove_dir, glove_path))
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
f.close()
logging.debug('Total embeddings found: %s.' % len(embeddings_index))
return embeddings_index

14
utils/info.py Normal file

@ -0,0 +1,14 @@
import logging
def log_versions():
import keras
logging.info("Keras version %s" % keras.__version__)
import numpy as np
logging.info("Numpy version %s" % np.__version__)
if keras.backend.backend() == 'theano':
import theano
logging.info("Theano version %s" % theano.__version__)
else:
import tensorflow
logging.info("Tensorflow version %s" % tensorflow.__version__)

32
utils/plots.py Normal file

@ -0,0 +1,32 @@
import matplotlib.pyplot as plt
def plot_accuracy(history) :
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model Accuracy over epochs')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['Training', 'Validation'], loc='upper left')
plt.show()
def plot_loss(history) :
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss over epochs')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['Training', 'Validation'], loc='upper left')
plt.show()
def plot_prf(history) :
plt.plot(history.history['precision'])
plt.plot(history.history['recall'])
plt.plot(history.history['f1'])
plt.title('P/R/F1 scores on validation set')
plt.ylabel('score')
plt.xlabel('epoch')
plt.legend(['Precision', 'Recall', 'F1'], loc='upper left')
plt.show()

258
utils/postprocessing.py Normal file

@ -0,0 +1,258 @@
import itertools
import numpy as np
from nlp import chunker, cleaner
def undo_sequential(output):
"""
Transforms a 3D one-hot array of the type (documents,token,category)
in a 2D array of the type (documents,token_category).
:param output: a one-hot 3D array
:return: a 2D array
"""
return np.argmax(output,axis=2)
def get_words(docs, selections):
"""
Gets the selected words in the provided documents.
:param docs: the document to analyze
:param selections: the words selected in the documents
:return: a dictionary with the documents and for each a list of
the selected words
"""
i = 0
obtained_words = {}
for doc, words in docs.items():
k = 0
obtained_words_doc = []
in_word = False
for token in selections[i]:
if token == 1 and k < len(words):
obtained_words_doc.append([words[k]])
in_word = True
elif token == 2 and k < len(words) and in_word:
obtained_words_doc[len(obtained_words_doc) - 1].append(words[k])
else:
in_word = False
k += 1
# remove duplicate selections
obtained_words_doc.sort()
obtained_words_doc = list(w for w, _ in itertools.groupby(obtained_words_doc))
obtained_words[doc] = obtained_words_doc
i += 1
return obtained_words
def get_top_words(docs,output,words_limit):
"""
Gets the selected words in the provided documents.
:param docs: the document to analyze
:param output: the output of the network
:param words_limit: how many words to extract
:return: a dictionary with the documents and for each a list of
the selected words
"""
selections = undo_sequential(output)
i = 0
obtained_words = {}
for doc, words in docs.items():
k = 0
obtained_words_doc = []
obtained_words_weights = []
in_word = False
for token in selections[i]:
if token == 1 and k < len(words):
obtained_words_doc.append([words[k]])
obtained_words_weights.append(output[i,k,1])
in_word = True
elif token == 2 and k < len(words) and in_word:
obtained_words_doc[len(obtained_words_doc) - 1].append(words[k])
obtained_words_weights[len(obtained_words_weights) - 1] = \
obtained_words_weights[len(obtained_words_weights) - 1] + \
((output[i,k,2] - obtained_words_weights[len(obtained_words_weights) - 1]) /
(len(obtained_words_doc[len(obtained_words_doc) - 1])))
# We calculate the average at the nth step this way:
# If A_i is the average at the ith step and x_i is the ith item of the sequence, then
# A_k = A_{k-1} + ((x_k - A_{k-1}) / k)
else:
in_word = False
k += 1
if words_limit < len(obtained_words_doc):
# there are more selections than the limit! cut them
obtained_words_and_scores = {}
for index, words in enumerate(obtained_words_doc):
obtained_words_and_scores[index] = obtained_words_weights[index]
sorted_words = sorted(obtained_words_and_scores, key=obtained_words_and_scores.__getitem__,reverse=True)
ok_obtained_words = []
cur_word = 0
while len(ok_obtained_words) < words_limit and cur_word < len(sorted_words):
if obtained_words_doc[sorted_words[cur_word]] not in ok_obtained_words:
ok_obtained_words.append(obtained_words_doc[sorted_words[cur_word]])
cur_word += 1
obtained_words_doc = ok_obtained_words
else:
# just remove duplicate selections
obtained_words_doc.sort()
obtained_words_doc = list(w for w, _ in itertools.groupby(obtained_words_doc))
obtained_words[doc] = obtained_words_doc
i += 1
return obtained_words
def get_valid_patterns(answer_set):
"""
Remove the answers from a set that do NOT match the keyphrase part-of-speech patterns.
:param answer_set: a dictionary of documents and tokenized keyphrases
:return: a dictionary of documents and tokenized keyphrases that match the part-of-speech patterns
"""
doc_filtered = {}
for doc, kps in answer_set.items():
filtered_keyphrases = []
for kp in kps:
for valid_kp in chunker.extract_valid_tokens(kp):
filtered_keyphrases.append(valid_kp)
# remove duplicates
filtered_keyphrases.sort()
filtered_keyphrases = list(w for w, _ in itertools.groupby(filtered_keyphrases))
doc_filtered[doc] = filtered_keyphrases
return doc_filtered
def clean_answers(answer_set):
"""
Cleans the keyphrases by removing the tokens that are not PoS tagged with the allowed tags.
:param answer_set: a dictionary of documents and tokenized keyphrases
:return: a dictionary of documents and their cleaned tokenized keyphrases
"""
doc_filtered = {}
for doc, kps in answer_set.items():
filtered_keyphrases = []
for kp in kps:
clean_kp = cleaner.clean_tokens(kp)
if clean_kp:
filtered_keyphrases.append(clean_kp)
        # remove duplicates
filtered_keyphrases.sort()
filtered_keyphrases = list(w for w, _ in itertools.groupby(filtered_keyphrases))
doc_filtered[doc] = filtered_keyphrases
return doc_filtered
def get_answers(candidate_tokens,predict_set,predict_result,dictionary):
"""
Build the dictionary of the selected answer for a QA-based network.
:param candidate_tokens: the dictionary of the documents and their candidate KPs
:param predict_set: the input of the network
:param predict_result: the output of the network
:param dictionary: the previously-fit word index
:return: the dictionary of the selected KPs
"""
    # Here the idea is: we go through the dictionary of the candidates, we find the corresponding
    # model input, and we add the candidate to the answer set if the model predicted class 1
    # (i.e. that the candidate was a correct KP).
# First, get the actual predictions:
if np.shape(predict_result)[1] == 1:
        # If we have just 1 output neuron, reshape and round the output to 0/1 values
predictions_flattened = np.round(np.reshape(predict_result,np.shape(predict_result)[0]))
else:
        # If we're working with categorical output, flatten the (num_samples,2) array to a (num_samples) one
        # This way we transform a 2D array, e.g. [[0.6,0.4] ... [0.2,0.8]], to a 1D array, e.g. [0...1]
predictions_flattened = np.argmax(predict_result, axis=1)
i = 0
answers = {}
for doc_id, candidate_list in candidate_tokens.items() :
answers[doc_id] = []
for candidate in candidate_list:
# Sanity check: was the order preserved?
assert candidate == dictionary.tokens_to_words(predict_set[1][i])
if predictions_flattened[i] == 1 :
answers[doc_id].append(candidate)
i += 1
return answers
def get_top_answers(candidate_tokens,predict_set,predict_result,dictionary,limit):
"""
Build the dictionary of the selected answer for a QA-based network.
:param candidate_tokens: the dictionary of the documents and their candidate KPs
:param predict_set: the input of the network
:param predict_result: the output of the network
    :param dictionary: the previously-fit word index
    :param limit: the maximum number of KPs to keep per document
    :return: the dictionary of the selected KPs
"""
    # Here the idea is: we go through the dictionary of the candidates, we find the corresponding
    # model input, and we add the candidate to the answer set if the model predicted class 1
    # (i.e. that the candidate was a correct KP).
# First, get the actual predictions:
if np.shape(predict_result)[1] == 1:
        # If we have just 1 output neuron, reshape and round the output to 0/1 values
predictions_flattened = np.round(np.reshape(predict_result,np.shape(predict_result)[0]))
else:
        # If we're working with categorical output, flatten the (num_samples,2) array to a (num_samples) one
        # This way we transform a 2D array, e.g. [[0.6,0.4] ... [0.2,0.8]], to a 1D array, e.g. [0...1]
predictions_flattened = np.argmax(predict_result, axis=1)
i = 0
answers = {}
scores = {}
for doc_id, candidate_list in candidate_tokens.items() :
answers[doc_id] = []
scores[doc_id] = []
for candidate in candidate_list:
# Sanity check: was the order preserved?
assert candidate == dictionary.tokens_to_words(predict_set[1][i])
if predictions_flattened[i] == 1 :
answers[doc_id].append(candidate)
if np.shape(predict_result)[1] == 1:
scores[doc_id].append(predict_result[i][0])
else:
scores[doc_id].append(predict_result[i][1])
i += 1
if len(answers[doc_id]) > limit :
answers[doc_id] = [x for _,x in sorted(zip(scores[doc_id],answers[doc_id]),reverse=True)][:limit]
return answers
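
A minimal decoding sketch for undo_sequential and get_words on toy shapes (1 document, 5 tokens, 3 classes):

import numpy as np
from utils import postprocessing

docs = {"doc1": ["cutting", "carbon", "emissions", "is", "hard"]}
output = np.array([[[0.9, 0.1, 0.0],    # cutting   -> class 0 (outside)
                    [0.1, 0.8, 0.1],    # carbon    -> class 1 (KP start)
                    [0.1, 0.2, 0.7],    # emissions -> class 2 (KP continuation)
                    [0.9, 0.1, 0.0],    # is        -> class 0
                    [0.8, 0.1, 0.1]]])  # hard      -> class 0

tokens = postprocessing.undo_sequential(output)  # [[0, 1, 2, 0, 0]]
print(postprocessing.get_words(docs, tokens))    # {'doc1': [['carbon', 'emissions']]}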

560
utils/preprocessing.py Normal file

@ -0,0 +1,560 @@
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from utils import glove
from nlp import dictionary as dict
import logging
import numpy as np
import random
def prepare_answer(train_doc, train_answer, train_candidates,
test_doc, test_answer, test_candidates,
val_doc=None, val_answer=None, val_candidates=None,
max_document_length=1000,
max_answer_length=20,
max_vocabulary_size=50000,
embeddings_size=50):
"""
Prepares a dataset for use by a question-answer like model. This version will use the patterns generated
    previously for the training, test and validation sets as candidates for all three sets.
:param train_doc: the training documents
:param train_answer: the KPs for the training documents
:param train_candidates: the candidate KPs for the training documents
:param test_doc: the test documents
:param test_answer: the KPs for the test documents
:param test_candidates: the candidate KPs for the test documents
:param val_doc: the validation documents (can be None)
:param val_answer: the KPs for the validation documents (can be None)
:param val_candidates: the candidate KPs for the validation documents (can be None)
:param max_document_length: the maximum length of the documents (shorter documents will be truncated!)
:param max_answer_length: the maximum length of the answers (shorter answers will be truncated!)
:param max_vocabulary_size: the maximum size of the vocabulary to use
(i.e. we keep only the top max_vocabulary_size words)
:param embeddings_size: the size of the GLoVE embeddings to use
    :return: a tuple ([train_q, train_a], train_y, [test_q, test_a], test_y, [val_q, val_a], val_y,
             embedding_matrix, dictionary) containing the training, test and validation sets,
             an embedding matrix for an Embedding layer, and the fitted word dictionary
"""
# Prepare validation return data
val_q = None
val_a = None
val_y = None
# Prepare the return values: lists that will hold questions (documents), answers (keyphrases), and truth values
train_q = []
test_q = []
train_a = []
test_a = []
train_y = []
test_y = []
if val_doc and val_answer:
val_q = []
val_a = []
val_y = []
documents_full = []
for key, doc in train_doc.items():
documents_full.append(token for token in doc)
for key, doc in test_doc.items():
documents_full.append(token for token in doc)
if val_doc and val_answer:
for key, doc in val_doc.items():
documents_full.append(token for token in doc)
logging.debug("Fitting dictionary on %s documents..." % len(documents_full))
dictionary = dict.Dictionary(num_words=max_vocabulary_size)
dictionary.fit_on_texts(documents_full)
logging.debug("Dictionary fitting completed. Found %s unique tokens" % len(dictionary.word_index))
# Pair up each document with a candidate keyphrase and its truth value
for key, document in train_doc.items():
doc_sequence = dictionary.token_list_to_sequence(document)
for kp in train_candidates[key]:
train_q.append(doc_sequence)
train_a.append(dictionary.token_list_to_sequence(kp))
train_y.append([0, 1] if kp in train_answer[key] else [1, 0])
for key, document in test_doc.items():
doc_sequence = dictionary.token_list_to_sequence(document)
for kp in test_candidates[key]:
test_q.append(doc_sequence)
test_a.append(dictionary.token_list_to_sequence(kp))
test_y.append([0, 1] if kp in test_answer[key] else [1, 0])
if val_doc and val_answer:
for key, document in val_doc.items():
doc_sequence = dictionary.token_list_to_sequence(document)
for kp in val_candidates[key]:
val_q.append(doc_sequence)
val_a.append(dictionary.token_list_to_sequence(kp))
val_y.append([0, 1] if kp in val_answer[key] else [1, 0])
logging.debug("Longest training document : %s tokens" % len(max(train_q, key=len)))
logging.debug("Longest training answer : %s tokens" % len(max(train_a, key=len)))
logging.debug("Longest test document : %s tokens" % len(max(test_q, key=len)))
logging.debug("Longest test answer : %s tokens" % len(max(test_a, key=len)))
if val_doc and val_answer:
logging.debug("Longest validation document : %s tokens" % len(max(val_q, key=len)))
logging.debug("Longest validation answer : %s tokens" % len(max(val_a, key=len)))
train_q = np.asarray(pad_sequences(train_q, maxlen=max_document_length, padding='post', truncating='post'))
train_a = np.asarray(pad_sequences(train_a, maxlen=max_answer_length, padding='post', truncating='post'))
test_q = np.asarray(pad_sequences(test_q, maxlen=max_document_length, padding='post', truncating='post'))
test_a = np.asarray(pad_sequences(test_a, maxlen=max_answer_length, padding='post', truncating='post'))
if val_doc and val_answer:
val_q = np.asarray(pad_sequences(val_q, maxlen=max_document_length, padding='post', truncating='post'))
val_a = np.asarray(pad_sequences(val_a, maxlen=max_answer_length, padding='post', truncating='post'))
logging.debug("Training set documents size : %s", np.shape(train_q))
logging.debug("Training set answers size : %s", np.shape(train_a))
logging.debug("Test set documents size : %s", np.shape(test_q))
logging.debug("Test set answers size : %s ", np.shape(test_a))
if val_doc and val_answer:
logging.debug("Validation set documents size : %s", np.shape(val_q))
logging.debug("Validation set answers size : %s ", np.shape(val_a))
# prepare the matrix for the embedding layer
word_index = dictionary.word_index
embeddings_index = glove.load_glove('', embeddings_size)
num_words = min(max_vocabulary_size, 1 + len(word_index))
logging.debug("Building embedding matrix of size [%s,%s]..." % (num_words, embeddings_size))
embedding_matrix = np.zeros((num_words, embeddings_size))
for word, i in word_index.items():
if i >= num_words:
continue
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
# words not found in embedding index will be all-zeros.
embedding_matrix[i] = embedding_vector
return [train_q, train_a], train_y, [test_q, test_a], test_y, [val_q, val_a], val_y, embedding_matrix, dictionary
def prepare_answer_2(train_doc, train_answer, train_candidates,
test_doc, test_answer, test_candidates,
val_doc=None, val_answer=None, val_candidates=None,
max_document_length=1000,
max_answer_length=20,
max_vocabulary_size=50000,
embeddings_size=50):
"""
    Prepares a dataset for use by a question-answer like model. This version will use the patterns generated
    previously for the test and validation sets as candidates for these sets, and mix the correct answers with
    wrong patterns on the training set in order to have balanced data for training.
:param train_doc: the training documents
:param train_answer: the KPs for the training documents
:param train_candidates: the candidate KPs for the training documents
:param test_doc: the test documents
:param test_answer: the KPs for the test documents
:param test_candidates: the candidate KPs for the test documents
:param val_doc: the validation documents (can be None)
:param val_answer: the KPs for the validation documents (can be None)
:param val_candidates: the candidate KPs for the validation documents (can be None)
:param max_document_length: the maximum length of the documents (shorter documents will be truncated!)
:param max_answer_length: the maximum length of the answers (shorter answers will be truncated!)
:param max_vocabulary_size: the maximum size of the vocabulary to use
(i.e. we keep only the top max_vocabulary_size words)
:param embeddings_size: the size of the GLoVE embeddings to use
    :return: a tuple ([train_q, train_a], train_y, [test_q, test_a], test_y, [val_q, val_a], val_y,
             [val_q_balanced, val_a_balanced], val_y_balanced, embedding_matrix, dictionary) containing the
             training, test, validation and balanced validation sets, an embedding matrix, and the fitted dictionary
"""
# Prepare validation return data
val_q = None
val_a = None
val_y = None
val_q_balanced = None
val_a_balanced = None
val_y_balanced = None
# Prepare the return values: lists that will hold questions (documents), answers (keyphrases), and truth values
train_q = []
test_q = []
train_a = []
test_a = []
train_y = []
test_y = []
if val_doc and val_answer:
val_q = []
val_a = []
val_y = []
val_q_balanced = []
val_a_balanced = []
val_y_balanced = []
documents_full = []
for key, doc in train_doc.items():
documents_full.append(token for token in doc)
for key, doc in test_doc.items():
documents_full.append(token for token in doc)
if val_doc and val_answer:
for key, doc in val_doc.items():
documents_full.append(token for token in doc)
logging.debug("Fitting dictionary on %s documents..." % len(documents_full))
dictionary = dict.Dictionary(num_words=max_vocabulary_size)
dictionary.fit_on_texts(documents_full)
logging.debug("Dictionary fitting completed. Found %s unique tokens" % len(dictionary.word_index))
# Pair up each document with a candidate keyphrase and its truth value
for key, document in train_doc.items():
doc_sequence = dictionary.token_list_to_sequence(document)
# select wrong candidates (possibly, in same quantity as good answers)
wrong_candidates = list(train_candidates[key])
for answer in train_answer[key]:
if answer in wrong_candidates:
wrong_candidates.remove(answer)
while len(wrong_candidates) > len(train_answer[key]):
random_candidate = random.choice(wrong_candidates)
wrong_candidates.remove(random_candidate)
# append wrong candidates
for kp in wrong_candidates:
train_q.append(doc_sequence)
train_a.append(dictionary.token_list_to_sequence(kp))
train_y.append([1, 0])
# append true answers
for kp in train_answer[key]:
train_q.append(doc_sequence)
train_a.append(dictionary.token_list_to_sequence(kp))
train_y.append([0, 1])
if val_doc and val_answer:
for key, document in val_doc.items():
doc_sequence = dictionary.token_list_to_sequence(document)
# select wrong candidates (possibly, in same quantity as good answers)
wrong_candidates = list(val_candidates[key])
for answer in val_answer[key]:
if answer in wrong_candidates:
wrong_candidates.remove(answer)
while len(wrong_candidates) > len(val_answer[key]):
random_candidate = random.choice(wrong_candidates)
wrong_candidates.remove(random_candidate)
# append wrong candidates
for kp in wrong_candidates:
val_q_balanced.append(doc_sequence)
val_a_balanced.append(dictionary.token_list_to_sequence(kp))
val_y_balanced.append([1, 0])
# append true answers
for kp in val_answer[key]:
val_q_balanced.append(doc_sequence)
val_a_balanced.append(dictionary.token_list_to_sequence(kp))
val_y_balanced.append([0, 1])
# for the other sets, just pick the auto-generated candidates
for key, document in test_doc.items():
doc_sequence = dictionary.token_list_to_sequence(document)
for kp in test_candidates[key]:
test_q.append(doc_sequence)
test_a.append(dictionary.token_list_to_sequence(kp))
test_y.append([0, 1] if kp in test_answer[key] else [1, 0])
if val_doc and val_answer:
for key, document in val_doc.items():
doc_sequence = dictionary.token_list_to_sequence(document)
for kp in val_candidates[key]:
val_q.append(doc_sequence)
val_a.append(dictionary.token_list_to_sequence(kp))
val_y.append([0, 1] if kp in val_answer[key] else [1, 0])
logging.debug("Longest training document : %s tokens" % len(max(train_q, key=len)))
logging.debug("Longest training answer : %s tokens" % len(max(train_a, key=len)))
logging.debug("Longest test document : %s tokens" % len(max(test_q, key=len)))
logging.debug("Longest test answer : %s tokens" % len(max(test_a, key=len)))
if val_doc and val_answer:
logging.debug("Longest validation document : %s tokens" % len(max(val_q, key=len)))
logging.debug("Longest validation answer : %s tokens" % len(max(val_a, key=len)))
logging.debug("Longest balanced validation document : %s tokens" % len(max(val_q, key=len)))
logging.debug("Longest balanced validation answer : %s tokens" % len(max(val_a, key=len)))
train_q = np.asarray(pad_sequences(train_q, maxlen=max_document_length, padding='post', truncating='post'))
train_a = np.asarray(pad_sequences(train_a, maxlen=max_answer_length, padding='post', truncating='post'))
test_q = np.asarray(pad_sequences(test_q, maxlen=max_document_length, padding='post', truncating='post'))
test_a = np.asarray(pad_sequences(test_a, maxlen=max_answer_length, padding='post', truncating='post'))
if val_doc and val_answer:
val_q = np.asarray(pad_sequences(val_q, maxlen=max_document_length, padding='post', truncating='post'))
val_a = np.asarray(pad_sequences(val_a, maxlen=max_answer_length, padding='post', truncating='post'))
val_q_balanced = np.asarray(pad_sequences(val_q_balanced, maxlen=max_document_length, padding='post', truncating='post'))
val_a_balanced = np.asarray(pad_sequences(val_a_balanced, maxlen=max_answer_length, padding='post', truncating='post'))
logging.debug("Training set documents size : %s", np.shape(train_q))
logging.debug("Training set answers size : %s", np.shape(train_a))
logging.debug("Test set documents size : %s", np.shape(test_q))
logging.debug("Test set answers size : %s ", np.shape(test_a))
if val_doc and val_answer:
logging.debug("Validation set documents size : %s", np.shape(val_q))
logging.debug("Validation set answers size : %s ", np.shape(val_a))
logging.debug("Balanced Validation set documents size : %s", np.shape(val_q_balanced))
logging.debug("Balanced Validation set answers size : %s ", np.shape(val_a_balanced))
# prepare the matrix for the embedding layer
word_index = dictionary.word_index
embeddings_index = glove.load_glove('', embeddings_size)
num_words = min(max_vocabulary_size, 1 + len(word_index))
logging.debug("Building embedding matrix of size [%s,%s]..." % (num_words, embeddings_size))
embedding_matrix = np.zeros((num_words, embeddings_size))
for word, i in word_index.items():
if i >= num_words:
continue
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
# words not found in embedding index will be all-zeros.
embedding_matrix[i] = embedding_vector
return [train_q, train_a], train_y, [test_q, test_a], test_y, [val_q, val_a], val_y, \
[val_q_balanced, val_a_balanced], val_y_balanced, embedding_matrix, dictionary
def prepare_sequential(train_doc, train_answer, test_doc, test_answer, val_doc, val_answer,
max_document_length=35000,
max_vocabulary_size=5000,
embeddings_size=50,
stem_test = False):
"""
Prepares a dataset for use by a sequential, categorical model.
:param train_doc: the training documents
:param train_answer: the KPs for the training documents
:param test_doc: the test documents
:param test_answer: the KPs for the test documents
:param val_doc: the validation documents (can be None)
:param val_answer: the KPs for the validation documents (can be None)
:param max_document_length: the maximum length of the documents (shorter documents will be truncated!)
:param max_vocabulary_size: the maximum size of the vocabulary to use
(i.e. we keep only the top max_vocabulary_size words)
:param embeddings_size: the size of the GLoVE embeddings to use
:param stem_test: set the value to True if the test set answers are stemmed
:return: a tuple (train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix) containing the training,
test and validation set, and an embedding matrix for an Embedding layer
"""
train_answer_seq = make_sequential(train_doc, train_answer)
if not stem_test:
test_answer_seq = make_sequential(test_doc, test_answer)
else:
import copy
stemmed_test_doc = copy.deepcopy(test_doc)
stemmed_test_doc = stem_dataset(stemmed_test_doc)
test_answer_seq = make_sequential(stemmed_test_doc,test_answer)
# Prepare validation return data
val_x = None
val_y = None
if val_doc and val_answer:
val_answer_seq = make_sequential(val_doc, val_answer)
# Transform the documents to sequence
documents_full = []
train_y = []
test_y = []
if val_doc and val_answer:
val_y = []
for key, doc in train_doc.items():
documents_full.append(token for token in doc)
train_y.append(train_answer_seq[key])
for key, doc in test_doc.items():
documents_full.append(token for token in doc)
test_y.append(test_answer_seq[key])
if val_doc and val_answer:
for key, doc in val_doc.items():
documents_full.append(token for token in doc)
val_y.append(val_answer_seq[key])
logging.debug("Fitting dictionary on %s documents..." % len(documents_full))
dictionary = dict.Dictionary(num_words=max_vocabulary_size)
dictionary.fit_on_texts(documents_full)
logging.debug("Dictionary fitting completed. Found %s unique tokens" % len(dictionary.word_index))
# Now we can prepare the actual input
train_x = dictionary.texts_to_sequences(train_doc.values())
test_x = dictionary.texts_to_sequences(test_doc.values())
if val_doc and val_answer:
val_x = dictionary.texts_to_sequences(val_doc.values())
logging.debug("Longest training document : %s tokens" % len(max(train_x, key=len)))
logging.debug("Longest test document : %s tokens" % len(max(test_x, key=len)))
if val_doc and val_answer:
logging.debug("Longest validation document : %s tokens" % len(max(val_x, key=len)))
train_x = np.asarray(pad_sequences(train_x, maxlen=max_document_length, padding='post', truncating='post'))
train_y = pad_sequences(train_y, maxlen=max_document_length, padding='post', truncating='post')
train_y = make_categorical(train_y)
test_x = np.asarray(pad_sequences(test_x, maxlen=max_document_length, padding='post', truncating='post'))
test_y = pad_sequences(test_y, maxlen=max_document_length, padding='post', truncating='post')
test_y = make_categorical(test_y)
if val_doc and val_answer:
val_x = np.asarray(pad_sequences(val_x, maxlen=max_document_length, padding='post', truncating='post'))
val_y = pad_sequences(val_y, maxlen=max_document_length, padding='post', truncating='post')
val_y = make_categorical(val_y)
logging.debug("Training set samples size : %s", np.shape(train_x))
logging.debug("Training set answers size : %s", np.shape(train_y))
logging.debug("Test set samples size : %s", np.shape(test_x))
logging.debug("Test set answers size : %s ", np.shape(test_y))
if val_doc and val_answer:
logging.debug("Validation set samples size : %s", np.shape(val_x))
logging.debug("Validation set answers size : %s ", np.shape(val_y))
# prepare the matrix for the embedding layer
word_index = dictionary.word_index
embeddings_index = glove.load_glove('', embeddings_size)
num_words = min(max_vocabulary_size, 1 + len(word_index))
logging.debug("Building embedding matrix of size [%s,%s]..." % (num_words, embeddings_size))
embedding_matrix = np.zeros((num_words, embeddings_size))
for word, i in word_index.items():
if i >= num_words:
continue
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
# words not found in embedding index will be all-zeros.
embedding_matrix[i] = embedding_vector
return train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix
def make_sequential(documents, answers):
"""
Transform an answer-based dataset (i.e. with a list of
documents and a list of keyphrases) to a sequential, ner-like
dataset, i.e. where the answer set for each document is composed
by the lists of the documents' tokens marked as non-keyphrase (0),
beginning of keyphrase (1) and inside-keyphrase (2).
For example, for the tokens
"I am a python developer since today."
    If the keyphrases are "python developer" and "today", the answer
set for these tokens is
"[0 0 0 1 2 0 1]"
:param documents: the list of documents
:param answers: the list of keyphrases
:return: the new answer set
"""
seq_answers = {}
for key, document in documents.items():
doc_answers_set = answers[key]
        '''
        Sort the keyphrases by length: we process the shorter KPs first, so that
        if they are contained in a longer KP, the longer one will simply
        overwrite the shorter one.
        '''
doc_answers_set.sort(key=lambda a: len(a))
        '''
        This list will hold the answer sequence for the document.
        We initialize it as a list of zeros and fill in the 1s and 2s afterwards.
        '''
doc_answers_seq = [0] * len(document)
for answer in doc_answers_set:
            # find the positions where the first word of the KP appears
appearances = [i for i, word in enumerate(document) if word == answer[0]]
for idx in appearances:
is_kp = True
                # check whether the KP also matches starting from its second word
for i in range(1, len(answer)):
if (i + idx) < len(document):
is_kp = answer[i] == document[i + idx]
else:
                        # end of document
                        is_kp = False
                # if we found an actual KP, mark its tokens in the output list
if is_kp:
doc_answers_seq[idx] = 1
for i in range(1, len(answer)):
doc_answers_seq[idx + i] = 2
seq_answers[key] = doc_answers_seq
return seq_answers
def make_categorical(x):
"""
Transform a two-dimensional list into a 3-dimensional array. The 2nd
dimension of the input list becomes a one-hot 2D array, e.g.
if the input is [[1,2,0],...], the output will be
[[[0,1,0],[0,0,1],[1,0,0]],...]
:param x: a 2D-list
:return: a 3D-numpy array
"""
    # number of categories
    num_categories = max([item for sublist in x for item in sublist]) + 1
    # output as a numpy array
    new_x = np.zeros((len(x), len(x[0]), num_categories))
    # use keras to perform the actual categorical conversion
i = 0
for doc in x:
new_doc = np_utils.to_categorical(doc, num_classes=num_categories)
new_x[i] = new_doc
i += 1
return new_x
def stem_dataset(dataset):
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
for key, tokens in dataset.items():
stemmed_tokens = [stemmer.stem(token) for token in tokens]
dataset[key] = stemmed_tokens
return dataset
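
A small check of make_sequential on the toy example from its docstring (importing the module pulls in Keras, so the repository's dependencies must be installed):

from utils import preprocessing

docs = {"doc1": ["i", "am", "a", "python", "developer", "since", "today"]}
kps = {"doc1": [["python", "developer"], ["today"]]}
print(preprocessing.make_sequential(docs, kps))  # {'doc1': [0, 0, 0, 1, 2, 0, 1]}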