first commit

zhangxiaojun 2023-04-19 13:16:27 +08:00
parent 850c1fc3f4
commit 7e701970c7
15 changed files with 75814 additions and 0 deletions

173
Bi-LSTM.py Normal file

@ -0,0 +1,173 @@
import os,sys
os.chdir(sys.path[0])
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from datasets import EnNews
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
import numpy as np
import random as rn
np.random.seed(421)
rn.seed(12345)
import logging
from keras import regularizers
from keras.layers import Bidirectional, Dense, Dropout, Embedding, LSTM, TimeDistributed
from keras.models import Sequential, load_model
from datasets import *
from eval import keras_metrics, metrics
from nlp import tokenizer as tk
from utils import info, preprocessing, postprocessing, plots
# Logging configuration
logging.basicConfig(
format='%(asctime)s\t%(levelname)s\t%(message)s',
level=logging.DEBUG)
info.log_versions()
# Global variables
SAVE_MODEL = False
MODEL_PATH = "models/bilstm.h5"
SHOW_PLOTS = False
# Dataset and hyperparameters
Dataset = EnNews
rootpath = "/home/zhangxj/WorkFile/本科毕业设计"
tokenizer = tk.tokenizers.nltk
DATASET_FOLDER = rootpath+"/EnergyNews"
MAX_DOCUMENT_LENGTH = 400
MAX_VOCABULARY_SIZE = 20000
EMBEDDINGS_SIZE = 50
batch_size = 32
epochs = 20
KP_WEIGHT = 10
STEM_MODE = metrics.stemMode.both
STEM_TEST = False
# Load the dataset
logging.info("Loading dataset...")
data = Dataset(DATASET_FOLDER)
train_doc_str, train_answer_str = data.load_train()
test_doc_str, test_answer_str = data.load_test()
val_doc_str, val_answer_str = data.load_validation()
train_doc, train_answer = tk.tokenize_set(train_doc_str, train_answer_str, tokenizer)
test_doc, test_answer = tk.tokenize_set(test_doc_str, test_answer_str, tokenizer)
val_doc, val_answer = tk.tokenize_set(val_doc_str, val_answer_str, tokenizer)
# Sanity check
logging.info("Dataset loaded. Preprocessing data...")
train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix = preprocessing. \
prepare_sequential(train_doc, train_answer, test_doc, test_answer, val_doc, val_answer,
max_document_length=MAX_DOCUMENT_LENGTH,
max_vocabulary_size=MAX_VOCABULARY_SIZE,
embeddings_size=EMBEDDINGS_SIZE,
stem_test=STEM_TEST)
# Weight the training samples: everything that is not a KP gets down-weighted
from sklearn.utils import class_weight
train_y_weights = np.argmax(train_y, axis=2)
train_y_weights = np.reshape(class_weight.compute_sample_weight('balanced', train_y_weights.flatten()),
np.shape(train_y_weights))
logging.info("数据预处理完成")
logging.info("可能的最大召回率: %s",
metrics.recall(test_answer,
postprocessing.get_words(test_doc, postprocessing.undo_sequential(test_y)),
STEM_MODE))
if not SAVE_MODEL or not os.path.isfile(MODEL_PATH):
logging.debug("建立网络...")
model = Sequential()
print("-------",np.shape(embedding_matrix)[0])
embedding_layer = Embedding(np.shape(embedding_matrix)[0],
EMBEDDINGS_SIZE,
weights=[embedding_matrix],
input_length=MAX_DOCUMENT_LENGTH,
trainable=False)
model.add(embedding_layer)
model.add(Bidirectional(LSTM(300, activation='tanh', recurrent_activation='hard_sigmoid', return_sequences=True)))
model.add(Dropout(0.25))
model.add(TimeDistributed(Dense(150, activation='relu', kernel_regularizer=regularizers.l2(0.01))))
model.add(Dropout(0.25))
model.add(TimeDistributed(Dense(2, activation='softmax')))
logging.info("编译网络...")
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'],
sample_weight_mode="temporal")
print(model.summary())
metrics_callback = keras_metrics.MetricsCallback(val_x, val_y)
logging.info("拟合网络...")
history = model.fit(train_x, train_y,
validation_data=(val_x, val_y),
epochs=epochs,
batch_size=batch_size,
sample_weight=train_y_weights,
callbacks=[metrics_callback])
if SHOW_PLOTS:
plots.plot_accuracy(history)
plots.plot_loss(history)
plots.plot_prf(metrics_callback)
if SAVE_MODEL:
model.save(MODEL_PATH)
logging.info("模型保存路径 in %s", MODEL_PATH)
else:
logging.info("加载模型 %s...", MODEL_PATH)
model = load_model(MODEL_PATH)
logging.info("加载模型完成")
logging.info("在测试集上预测...")
output = model.predict(x=test_x, verbose=1)
logging.debug("输出格式: %s", np.shape(output))
obtained_tokens = postprocessing.undo_sequential(output)
obtained_words = postprocessing.get_words(test_doc, obtained_tokens)
precision = metrics.precision(test_answer, obtained_words,STEM_MODE)
recall = metrics.recall(test_answer, obtained_words,STEM_MODE)
f1 = metrics.f1(precision, recall)
print("### 获得的分数 ###")
print("###")
print("### Precision : %.4f" % precision)
print("### Recall : %.4f" % recall)
print("### F1 : %.4f" % f1)
print("### ###")
keras_precision = keras_metrics.keras_precision(test_y, output)
keras_recall = keras_metrics.keras_recall(test_y, output)
keras_f1 = keras_metrics.keras_f1(test_y, output)
print("### 获得的分数 ###")
print("###")
print("### Precision : %.4f" % keras_precision)
print("### Recall : %.4f" % keras_recall)
print("### F1 : %.4f" % keras_f1)
print("### ###")

209
datasets.py Normal file

@ -0,0 +1,209 @@
import logging
import os
from nlp import tokenizer as tk
class Dataset(object):
"""
An abstract class that represents a dataset.
"""
def __init__(self, name, path):
self.path = path
self.name = name
self.test_documents = None
self.test_answers = None
self.train_documents = None
self.train_answers = None
self.validation_documents = None
self.validation_answers = None
logging.debug("初始化数据集 %s 文件夹路径 %s" %
(self.name, self.path))
def __str__(self):
        return 'Dataset %s located at %s' % (self.name, self.path)
def _load_test_documents(self):
"""
Loads the test documents.
:return: a list of documents.
"""
raise NotImplementedError
def _load_test_answers(self):
"""
Loads the answers for the test documents.
:return: a list of answers.
"""
raise NotImplementedError
def _load_train_documents(self):
"""
Loads the train documents.
:return: a list of documents.
"""
raise NotImplementedError
def _load_train_answers(self):
"""
Loads the answers for the train documents.
:return: a list of answers.
"""
raise NotImplementedError
def _load_validation_documents(self):
"""
Loads the validation documents.
:return: a list of documents.
"""
raise NotImplementedError
def _load_validation_answers(self):
"""
Loads the answers for the validation documents.
:return: a list of answers.
"""
raise NotImplementedError
def load_test(self):
"""
Loads the test documents and their answers.
:return: a tuple containing the test documents and the test answers.
"""
if not self.test_documents:
self.test_documents = self._load_test_documents()
if not self.test_answers:
self.test_answers = self._load_test_answers()
        assert (len(self.test_documents) == len(self.test_answers)), \
            "You don't have enough (or have too many) test answers for your documents!"
        logging.debug("Loaded test set for dataset %s" % self.name)
return self.test_documents, self.test_answers
def load_train(self):
"""
Loads the training documents and their answers.
:return: a tuple containing the train documents and the training answers.
"""
if not self.train_documents:
self.train_documents = self._load_train_documents()
if not self.train_answers:
self.train_answers = self._load_train_answers()
        assert (len(self.train_documents) == len(self.train_answers)), \
            "You don't have enough (or have too many) train answers for your documents!"
        logging.debug("Loaded training set for dataset %s" % self.name)
return self.train_documents, self.train_answers
def load_validation(self):
"""
Loads the validation documents and their answers.
        :return: a tuple containing the validation documents and the validation answers.
"""
if not self.validation_documents:
self.validation_documents = self._load_validation_documents()
if not self.validation_answers:
self.validation_answers = self._load_validation_answers()
        assert (not self.validation_documents and not self.validation_answers) or \
               (len(self.validation_documents) == len(self.validation_answers)), \
            "You don't have enough (or have too many) validation answers for your documents!"
        logging.debug("Loaded validation set for dataset %s" % self.name)
return self.validation_documents, self.validation_answers
class EnNews(Dataset):
"""
Dataset from Annette Hulth's "Improved Automatic Keyword Extraction
Given More Linguistic Knowledge"
Note: to make the results obtained with this dataset comparable to
the ones described in Hulth's paper, only the "uncontrolled" terms
are used.
Full-text here: http://www.aclweb.org/anthology/W03-1028
"""
def __init__(self, path):
super().__init__("EnergyNews", path)
def __load_documents(self, folder):
"""
        Loads the documents in the .clr files contained
in the specified folder and puts them in a dictionary
indexed by document id (i.e. the filename without the
extension).
:param folder: the folder containing the documents
:return: a dictionary with the documents
"""
# This dictionary will contain the documents
documents = {}
for doc in os.listdir("%s/%s" % (self.path, folder)):
if doc.endswith(".clr"):
content = open(("%s/%s/%s" % (self.path, folder, doc)), "r").read()
documents[doc[:doc.find('.')]] = content
return documents
def __load_answers(self, folder):
"""
        Loads the answers contained in the .key files
and puts them in a dictionary indexed by document ID
(i.e. the document name without the extension)
:param folder: the folder containing the answer files
:return: a dictionary with the answers
"""
# This dictionary will contain the answers
answers = {}
for doc in os.listdir("%s/%s" % (self.path, folder)):
if doc.endswith(".key"):
content = open(("%s/%s/%s" % (self.path, folder, doc)), "r").read()
retrieved_answers = content.split(' ')
doc_id = doc[:doc.find('.')]
for answer in retrieved_answers:
                    answer = answer.strip()  # strip leading/trailing whitespace
if doc_id not in answers:
answers[doc_id] = [answer]
else:
answers[doc_id].append(answer)
return answers
def _load_test_documents(self):
return self.__load_documents("test")
def _load_train_documents(self):
return self.__load_documents("train")
def _load_validation_documents(self):
return self.__load_documents("validation")
def _load_test_answers(self):
return self.__load_answers("test")
def _load_train_answers(self):
return self.__load_answers("train")
def _load_validation_answers(self):
return self.__load_answers("validation")

31
eval/anno_generator.py Normal file

@ -0,0 +1,31 @@
import os
def write_anno(output_folder, documents, keyphrases):
    # create the output directory if it does not exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    for doc_id, doc_string in documents.items():
        i = 0
        output_file = open("%s/%s.%s" % (output_folder, doc_id, "ann"), "w")
        for kp in keyphrases[doc_id]:
            kp_string = ' '.join(kp)
            for start_index in list(find_all(doc_string, kp_string)):
                end_index = start_index + len(kp_string)
                output_file.write("T%s\t%s %s %s\t%s\n" %
                                  (i, "NO_TYPE", start_index, end_index, kp_string))
                i += 1  # brat annotation IDs must be unique within a file (T0, T1, ...)
        output_file.close()
def find_all(target_string, substring):
start = 0
while True:
start = target_string.find(substring, start)
if start == -1: return
yield start
start += 1
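
Usage sketch with toy inputs; "out_ann" is a placeholder folder name, documents map ids to raw strings, and keyphrases map ids to lists of token lists:

from eval import anno_generator

docs = {"doc1": "solar power plants reduce carbon emissions"}
kps = {"doc1": [["solar", "power"], ["carbon", "emissions"]]}
anno_generator.write_anno("out_ann", docs, kps)  # writes out_ann/doc1.ann in brat standoff format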

220
eval/keras_metrics.py Normal file

@ -0,0 +1,220 @@
import keras
import numpy as np
import logging
class MetricsCallback(keras.callbacks.Callback):
def __init__(self,val_x,val_y):
self.val_x = val_x
self.val_y = val_y
self.epoch = []
self.history = {}
def on_epoch_end(self, epoch, logs={}):
# Predict on the validation data
y_pred = self.model.predict(self.val_x)
precision = keras_precision(self.val_y,y_pred)
recall = keras_recall(self.val_y, y_pred)
f1 = keras_f1(self.val_y, y_pred)
print("")
print("### Validation Scores ###")
print("###")
print("### Epoch : %s" % (epoch+1))
print("### Precision : %.4f" % precision)
print("### Recall : %.4f" % recall)
print("### F1 : %.4f" % f1)
print("### ###")
self.epoch.append(epoch+1)
self.history.setdefault("precision", []).append(precision)
self.history.setdefault("recall", []).append(recall)
self.history.setdefault("f1", []).append(f1)
class MetricsCallbackQA(keras.callbacks.Callback):
def __init__(self,val_x,val_y,batch_size = 128):
self.val_x = val_x
self.val_y = val_y
self.epoch = []
self.history = {}
self.batch_size = batch_size
def on_epoch_end(self, epoch, logs={}):
# Predict on the validation data
y_pred = self.model.predict(self.val_x,batch_size=self.batch_size,verbose=1)
precision = keras_precision_qa(self.val_y,y_pred)
recall = keras_recall_qa(self.val_y, y_pred)
f1 = keras_f1_qa(self.val_y, y_pred)
print("")
print("### Validation Scores ###")
print("###")
print("### Epoch : %s" % (epoch+1))
print("### Precision : %.4f" % precision)
print("### Recall : %.4f" % recall)
print("### F1 : %.4f" % f1)
print("### ###")
self.epoch.append(epoch+1)
self.history.setdefault("precision", []).append(precision)
self.history.setdefault("recall", []).append(recall)
self.history.setdefault("f1", []).append(f1)
def keras_precision(y_true,y_pred) :
true_positives = 0
false_positives = 0
# reduce dimensionality
y_true_2d = np.argmax(y_true,axis=2)
y_pred_2d = np.argmax(y_pred,axis=2)
y_true_indices = {}
for i in range(np.shape(y_true_2d)[0]):
doc_true_indices = []
in_word = False
for j in range(np.shape(y_true_2d)[1]):
if y_true_2d[i][j] == 1 :
doc_true_indices.append(["%s" % j])
in_word = True
elif j > 0 and y_true_2d[i][j] == 2 and in_word:
doc_true_indices[len(doc_true_indices) -1].append(",%s" % j)
else:
in_word = False
y_true_indices[i] = doc_true_indices
y_pred_indices = {}
for i in range(np.shape(y_pred_2d)[0]):
doc_true_indices = []
in_word = False
for j in range(np.shape(y_pred_2d)[1]):
if y_pred_2d[i][j] == 1:
doc_true_indices.append(["%s" % j])
in_word = True
elif j > 0 and y_pred_2d[i][j] == 2 and in_word:
doc_true_indices[len(doc_true_indices) - 1].append(",%s" % j)
else :
in_word = False
y_pred_indices[i] = doc_true_indices
for i in range(len(y_pred_indices)) :
for kp in y_pred_indices[i]:
if kp in y_true_indices[i]:
true_positives += 1
else :
false_positives += 1
return (1.0 * true_positives) / (true_positives + false_positives) \
if true_positives + false_positives > 0 else 0
def keras_recall(y_true,y_pred) :
true_positives = 0
false_positives = 0
# reduce dimensionality
y_true_2d = np.argmax(y_true,axis=2)
y_pred_2d = np.argmax(y_pred,axis=2)
y_true_indices = {}
for i in range(np.shape(y_true_2d)[0]):
doc_true_indices = []
in_word = False
for j in range(np.shape(y_true_2d)[1]):
if y_true_2d[i][j] == 1 :
doc_true_indices.append(["%s" % j])
in_word = True
elif j > 0 and y_true_2d[i][j] == 2 and in_word:
doc_true_indices[len(doc_true_indices) -1].append(",%s" % j)
else:
in_word = False
y_true_indices[i] = doc_true_indices
y_pred_indices = {}
for i in range(np.shape(y_pred_2d)[0]):
doc_true_indices = []
in_word = False
for j in range(np.shape(y_pred_2d)[1]):
if y_pred_2d[i][j] == 1:
doc_true_indices.append(["%s" % j])
in_word = True
elif j > 0 and y_pred_2d[i][j] == 2 and in_word:
doc_true_indices[len(doc_true_indices) - 1].append(",%s" % j)
else :
in_word = False
y_pred_indices[i] = doc_true_indices
for i in range(len(y_pred_indices)) :
for kp in y_pred_indices[i]:
if kp in y_true_indices[i]:
true_positives += 1
return (1.0 * true_positives) / sum(len(kps) for doc,kps in y_true_indices.items())
def keras_f1(y_true,y_pred):
p = keras_precision(y_true,y_pred)
r = keras_recall(y_true,y_pred)
return (2*(p * r)) / (p + r) if p != 0 and r != 0 else 0
def keras_precision_qa(y_true,y_pred) :
# Prepare data
if np.shape(y_pred)[1] == 2:
# If one-hot prediction...
y_true = np.argmax(y_true,axis=1)
y_pred = np.argmax(y_pred,axis=1)
else:
# If similarity-based...
y_pred = np.reshape(y_pred, np.shape(y_true))
y_pred = np.round(y_pred)
den = np.count_nonzero(y_pred)
if den == 0:
logging.log(logging.WARNING,"Network did not predict any positive sample")
return 0
return np.count_nonzero(np.in1d(np.where(y_pred), np.where(y_true))) / den
def keras_recall_qa(y_true,y_pred) :
# Prepare data
if np.shape(y_pred)[1] == 2:
# If one-hot prediction...
y_true = np.argmax(y_true, axis=1)
y_pred = np.argmax(y_pred, axis=1)
else:
# If similarity-based...
y_pred = np.reshape(y_pred, np.shape(y_true))
y_pred = np.round(y_pred)
return np.count_nonzero(np.in1d(np.where(y_true), np.where(y_pred))) / np.count_nonzero(y_true)
def keras_f1_qa(y_true,y_pred):
p = keras_precision_qa(y_true,y_pred)
r = keras_recall_qa(y_true,y_pred)
return (2*(p * r)) / (p + r) if p + r > 0 else 0
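
A small sanity check of the sequence-level metrics above on a toy one-hot batch (1 document, 4 tokens, 3 classes: 0 = outside, 1 = keyphrase start, 2 = keyphrase continuation):

import numpy as np
from eval import keras_metrics

y_true = np.array([[[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]]])  # gold KP spans tokens 1-2
y_pred = np.array([[[0.9, 0.1, 0.0], [0.2, 0.7, 0.1], [0.1, 0.1, 0.8], [0.8, 0.1, 0.1]]])

p = keras_metrics.keras_precision(y_true, y_pred)  # 1.0: the predicted KP matches the gold one
r = keras_metrics.keras_recall(y_true, y_pred)     # 1.0: the gold KP is recovered
print(p, r, keras_metrics.keras_f1(y_true, y_pred))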

82
eval/metrics.py Normal file

@ -0,0 +1,82 @@
from enum import Enum
from nltk.stem import *
stemMode = Enum("StemmerMode","none both results")
def precision(reference,obtained,stem = stemMode.none):
true_positives = 0
false_positives = 0
for doc, reference_kps_tokens in reference.items():
obtained_kps_tokens = obtained[doc]
reference_kps = []
obtained_kps = []
for ref_tokens in reference_kps_tokens:
if stem == stemMode.both:
stemmer = PorterStemmer()
ref_tokens = [stemmer.stem(token) for token in ref_tokens]
reference_kp = ' '.join(ref_tokens)
reference_kps.append(reference_kp.lower())
for obt_tokens in obtained_kps_tokens:
if stem == stemMode.both or stem == stemMode.results:
stemmer = PorterStemmer()
obt_tokens = [stemmer.stem(token) for token in obt_tokens]
obt_string = ' '.join(obt_tokens).lower()
if obt_string not in obtained_kps:
# this is necessary, because if we stem the kps we may
# obtain duplicates
obtained_kps.append(obt_string)
for obt_string in obtained_kps:
if obt_string in reference_kps:
true_positives += 1
else:
false_positives += 1
return (true_positives * 1.0) / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
def recall(reference,obtained,stem=stemMode.none):
true_positives = 0
total_reference = sum(len(kps) for doc,kps in reference.items())
for doc, reference_kps_tokens in reference.items():
obtained_kps_tokens = obtained[doc]
reference_kps = []
for ref_tokens in reference_kps_tokens:
if stem == stemMode.both:
stemmer = PorterStemmer()
ref_tokens = [stemmer.stem(token) for token in ref_tokens]
reference_kp = ' '.join(ref_tokens)
reference_kps.append(reference_kp)
for obt_tokens in obtained_kps_tokens:
if stem == stemMode.both or stem == stemMode.results:
stemmer = PorterStemmer()
obt_tokens = [stemmer.stem(token) for token in obt_tokens]
obt_string = ' '.join(obt_tokens)
if obt_string in reference_kps:
true_positives += 1
reference_kps.remove(obt_string)
return (true_positives * 1.0) / total_reference
def f1(precision, recall):
return (2 * (precision * recall)) / (precision + recall) if precision + recall > 0 else 0
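
A small worked example of the document-level scores above, on toy tokenized keyphrases:

from eval import metrics

reference = {"doc1": [["solar", "power"], ["emissions"]]}          # gold keyphrases
obtained = {"doc1": [["solar", "power"], ["wind", "turbines"]]}    # system output

p = metrics.precision(reference, obtained, metrics.stemMode.none)  # 1 of 2 obtained KPs is correct -> 0.5
r = metrics.recall(reference, obtained, metrics.stemMode.none)     # 1 of 2 gold KPs is found -> 0.5
print(p, r, metrics.f1(p, r))                                      # 0.5 0.5 0.5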

73939
glove/vectors.txt Normal file

File diff suppressed because it is too large

95
nlp/chunker.py Normal file

@ -0,0 +1,95 @@
import nltk
from nltk.chunk.regexp import *
from nlp import tokenizer as tk
KP_REGEX_1 = "<JJ|NN|NNP|NNS|NNPS>*<NN|NNP|NNS|NNPS|VB|VBG>"
KP_REGEX_2 = "<JJ>?<NN|NNS>+<IN><NN|NNS>"
KP_REGEX_3 = "<JJ|VBN>*<NN|NNS>"
noun_phrase_grammar = r"""
NBAR:
{<NN.*|JJ>*<NN.*|VBG>} # Nouns and Adjectives, terminated with Nouns or -ing verbs
KP:
{<NBAR>}
{<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
"""
hulth_grammar = r"""
NBAR:
{<NN.*|JJ.*>*<NN.*|VBG>} # Nouns and Adjectives, terminated with Nouns or -ing verbs
VBPART:
{<VBG|VBP><NBAR>} # Verb in participle from, then nouns
COUNT:
{<CD><NBAR>} # Numbers then nouns
NP:
{<NBAR><IN><NBAR>}
"""
hulth_labels = ['NP','NBAR','COUNT','VBPART']
def extract_candidates_from_set(set,tokenizer):
"""
Generates the candidate keyphrases for a document.
:param set: the training, test or validation set
:param tokenizer: which tokenizer to use
:return: a dictionary where each document is associated with its candidate keyphrases
"""
candidates = {}
for doc, str in set.items() :
candidates[doc] = extract_candidates(str,tokenizer)
return candidates
def extract_candidates(document,tokenizer):
"""
Extracts the candidate keyphrases from a string.
:param document: the string to analyze
:param tokenizer: the tokenizer to use
:return: the list of candidate keyphrases for the input document
"""
return extract_valid_tokens(tk.tokenize(document,tokenizer))
def extract_valid_tokens(tokens):
"""
Given a list of tokens, returns the subsets of such list which are potential keyphrases according to
the provided part-of-speech patterns.
    :param tokens: the token list to analyze
:return: the list of candidate keyphrases for the input document
"""
postagged_doc = nltk.pos_tag(tokens)
kp_rule_1 = ChunkRule(KP_REGEX_1,"")
kp_rule_2 = ChunkRule(KP_REGEX_2, "")
kp_rule_3 = ChunkRule(KP_REGEX_3, "")
#chunk_parser = RegexpChunkParser([kp_rule_1, kp_rule_2, kp_rule_3],
# chunk_label="KP")
chunk_parser = RegexpParser(grammar=hulth_grammar)
tree = chunk_parser.parse(postagged_doc)
candidates = []
for subtree in tree.subtrees():
if subtree.label() in hulth_labels:
candidate = []
for leaf in subtree.leaves():
candidate.append(leaf[0])
if candidate not in candidates:
candidates.append(candidate)
return candidates
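
Usage sketch on a toy token list; it assumes the NLTK POS tagger models are installed, and the exact candidates depend on the tagger output:

from nlp import chunker

tokens = ["renewable", "energy", "production", "is", "rising", "in", "china"]
print(chunker.extract_valid_tokens(tokens))
# prints noun-phrase-style candidates, e.g. [['renewable', 'energy', 'production'], ['china'], ...]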

38
nlp/cleaner.py Normal file

@ -0,0 +1,38 @@
import nltk
# NLTK uses the Penn Treebank tag set
# See http://www.comp.leeds.ac.uk/amalgam/tagsets/upenn.html
ALLOWED_TAGS_HEAD = ["NN","NNP","NNPS","NNS","VBN","VBG","JJ","JJR","JJS","RB","CD"]
ALLOWED_TAGS_TAIL = ["NN","NNP","NNPS","NNS","VBG","CD",")"]
def clean_tokens(keyphrase):
"""
    Removes the tokens from the head and the tail of a keyphrase
(passed as a token list) that do not match the allowed PoS tags.
:return: the cleaned keyphrase
"""
keyphrase_pos = nltk.pos_tag(keyphrase)
start = 0
for start in range(len(keyphrase_pos)):
if not keyphrase_pos[start][1] in ALLOWED_TAGS_HEAD:
start += 1
else:
break
end = len(keyphrase) - 1
for end in range(len(keyphrase_pos) - 1,start,-1):
if not keyphrase_pos[end][1] in ALLOWED_TAGS_TAIL:
end -= 1
else:
break
return keyphrase[start:end+1]
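
Usage sketch (again assuming the NLTK POS tagger models are installed; the result depends on the tagging):

from nlp import cleaner

print(cleaner.clean_tokens(["the", "solar", "panel", "of"]))
# e.g. ['solar', 'panel'] -- the leading determiner and trailing preposition are stripped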

101
nlp/dictionary.py Normal file

@ -0,0 +1,101 @@
from collections import OrderedDict
class Dictionary(object):
"""Dictionary utility class. This class is a lightweight version of the Keras text preprocessing module
(see https://github.com/fchollet/keras/blob/master/keras/preprocessing/text.py), designed to work on
tokens instead of strings.
This class is used to build a dictionary that can in turn be used to fill an Embedding layer
with word embeddings.
Please note that `0` is a reserved index that won't be assigned to any word.
The original keras.preprocessing.text module is licensed under the MIT license.
"""
def __init__(self, num_words=None):
self.word_counts = OrderedDict()
self.word_index = {}
self.reverse_word_index = None
self.num_words = num_words
self.document_count = 0
def fit_on_texts(self, tokenized_documents):
for document in tokenized_documents:
self.document_count += 1
for w in document:
if w in self.word_counts:
self.word_counts[w] += 1
else:
self.word_counts[w] = 1
wcounts = list(self.word_counts.items())
wcounts.sort(key=lambda x: x[1], reverse=True)
sorted_voc = [wc[0] for wc in wcounts]
# note that index 0 is reserved, never assigned to an existing word
self.word_index = dict(list(zip(sorted_voc, list(range(1, len(sorted_voc) + 1)))))
def texts_to_sequences(self, texts):
"""
        Transforms each text in texts into a sequence of integers.
        Only the top "num_words" most frequent words will be taken into account.
        :param texts: a list of token lists
:return: A list of sequences.
"""
texts_sequences = []
for text in texts:
texts_sequences.append(self.token_list_to_sequence(text))
return texts_sequences
def token_list_to_sequence(self, tokens):
"""Transforms each text in texts in a sequence of integers.
Only top "num_words" most frequent words will be taken into account.
Only words known by the tokenizer will be taken into account.
# Arguments
tokens: A list of texts (strings).
# Yields
Yields individual sequences.
"""
vect = []
for w in tokens:
i = self.word_index.get(w)
if i is not None:
if self.num_words and i >= self.num_words:
continue
else:
vect.append(i)
return vect
def tokens_to_words(self, tokens):
"""
Utility that prints the words associated to the provided indices.
:param tokens: a list of integers
"""
if not self.reverse_word_index:
self.build_reverse_word_index()
words = []
for token in tokens:
if token != 0:
words.append(self.reverse_word_index[token])
return words
def build_reverse_word_index(self):
self.reverse_word_index = {}
for key, value in self.word_index.items():
self.reverse_word_index[value] = key
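
A minimal round trip through the Dictionary on toy tokenized documents (indices of equally frequent words may differ):

from nlp.dictionary import Dictionary

docs = [["wind", "power", "is", "clean"], ["wind", "turbines", "generate", "power"]]
d = Dictionary(num_words=100)
d.fit_on_texts(docs)
seqs = d.texts_to_sequences(docs)     # e.g. [[1, 2, 3, 4], [1, 5, 6, 2]]
print(seqs)
print(d.tokens_to_words(seqs[0]))     # back to ['wind', 'power', 'is', 'clean']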

37
nlp/tokenizer.py Normal file

@ -0,0 +1,37 @@
from enum import Enum
import keras.preprocessing.text
import nltk
tokenizers = Enum("Tokenizers","nltk keras")
def tokenize_set(documents,answers,tokenizer):
    tokenized_docs = {}
    for doc, doc_string in documents.items():
        tokenized_docs[doc] = tokenize(doc_string, tokenizer)
    tokenized_answers = {}
    for doc, doc_answers in answers.items():
        for answer in doc_answers:
            if doc not in tokenized_answers:
                tokenized_answers[doc] = [tokenize(answer, tokenizer)]
            else:
                tokenized_answers[doc].append(tokenize(answer, tokenizer))
    return tokenized_docs, tokenized_answers
def tokenize(string,tokenizer = tokenizers.keras):
"""
Tokenizes a string using the selected tokenizer.
:param string: the string to tokenize
:param tokenizer: which tokenizer to use (nltk or keras)
:return: the list of tokens
"""
if tokenizer == tokenizers.nltk:
return nltk.word_tokenize(string.lower())
elif tokenizer == tokenizers.keras:
return keras.preprocessing.text.text_to_word_sequence(string)
else:
raise NotImplementedError()
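
Usage sketch for the two supported backends; the NLTK branch assumes the 'punkt' models are installed:

from nlp import tokenizer as tk

print(tk.tokenize("Solar power is growing fast.", tk.tokenizers.nltk))
# e.g. ['solar', 'power', 'is', 'growing', 'fast', '.']
print(tk.tokenize("Solar power is growing fast.", tk.tokenizers.keras))
# e.g. ['solar', 'power', 'is', 'growing', 'fast']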

25
utils/glove.py Normal file

@ -0,0 +1,25 @@
import os,sys
os.chdir(sys.path[0])  # resolve relative paths from the script's directory
import numpy as np
import logging
# Load the pre-trained GloVe word vectors
def load_glove(glove_dir,size):
    embeddings_index = {}
    # NOTE: the vectors path below is absolute, so glove_dir is effectively ignored and size is unused
    glove_path = ("/home/zhangxj/WorkFile/本科毕业设计/glove/vectors.txt")
logging.debug("Loading GloVe pre-trained embeddings from %s" % glove_path)
f = open(os.path.join(glove_dir, glove_path))
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
f.close()
logging.debug('Total embeddings found: %s.' % len(embeddings_index))
return embeddings_index

14
utils/info.py Normal file

@ -0,0 +1,14 @@
import logging
def log_versions():
import keras
logging.info("Keras version %s" % keras.__version__)
import numpy as np
logging.info("Numpy version %s" % np.__version__)
if keras.backend.backend() == 'theano':
import theano
logging.info("Theano version %s" % theano.__version__)
else:
import tensorflow
logging.info("Tensorflow version %s" % tensorflow.__version__)

32
utils/plots.py Normal file

@ -0,0 +1,32 @@
import matplotlib.pyplot as plt
def plot_accuracy(history) :
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model Accuracy over epochs')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['Training', 'Validation'], loc='upper left')
plt.show()
def plot_loss(history) :
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss over epochs')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['Training', 'Validation'], loc='upper left')
plt.show()
def plot_prf(history) :
plt.plot(history.history['precision'])
plt.plot(history.history['recall'])
plt.plot(history.history['f1'])
plt.title('P/R/F1 scores on validation set')
plt.ylabel('score')
plt.xlabel('epoch')
plt.legend(['Precision', 'Recall', 'F1'], loc='upper left')
plt.show()

258
utils/postprocessing.py Normal file

@ -0,0 +1,258 @@
import itertools
import numpy as np
from nlp import chunker, cleaner
def undo_sequential(output):
"""
Transforms a 3D one-hot array of the type (documents,token,category)
in a 2D array of the type (documents,token_category).
:param output: a one-hot 3D array
:return: a 2D array
"""
return np.argmax(output,axis=2)
def get_words(docs, selections):
"""
Gets the selected words in the provided documents.
:param docs: the document to analyze
:param selections: the words selected in the documents
:return: a dictionary with the documents and for each a list of
the selected words
"""
i = 0
obtained_words = {}
for doc, words in docs.items():
k = 0
obtained_words_doc = []
in_word = False
for token in selections[i]:
if token == 1 and k < len(words):
obtained_words_doc.append([words[k]])
in_word = True
elif token == 2 and k < len(words) and in_word:
obtained_words_doc[len(obtained_words_doc) - 1].append(words[k])
else:
in_word = False
k += 1
# remove duplicate selections
obtained_words_doc.sort()
obtained_words_doc = list(w for w, _ in itertools.groupby(obtained_words_doc))
obtained_words[doc] = obtained_words_doc
i += 1
return obtained_words
def get_top_words(docs,output,words_limit):
"""
Gets the selected words in the provided documents.
:param docs: the document to analyze
:param output: the output of the network
:param words_limit: how many words to extract
:return: a dictionary with the documents and for each a list of
the selected words
"""
selections = undo_sequential(output)
i = 0
obtained_words = {}
for doc, words in docs.items():
k = 0
obtained_words_doc = []
obtained_words_weights = []
in_word = False
for token in selections[i]:
if token == 1 and k < len(words):
obtained_words_doc.append([words[k]])
obtained_words_weights.append(output[i,k,1])
in_word = True
elif token == 2 and k < len(words) and in_word:
obtained_words_doc[len(obtained_words_doc) - 1].append(words[k])
obtained_words_weights[len(obtained_words_weights) - 1] = \
obtained_words_weights[len(obtained_words_weights) - 1] + \
((output[i,k,2] - obtained_words_weights[len(obtained_words_weights) - 1]) /
(len(obtained_words_doc[len(obtained_words_doc) - 1])))
# We calculate the average at the nth step this way:
# If A_i is the average at the ith step and x_i is the ith item of the sequence, then
# A_k = A_{k-1} + ((x_k - A_{k-1}) / k)
else:
in_word = False
k += 1
if words_limit < len(obtained_words_doc):
# there are more selections than the limit! cut them
obtained_words_and_scores = {}
for index, words in enumerate(obtained_words_doc):
obtained_words_and_scores[index] = obtained_words_weights[index]
sorted_words = sorted(obtained_words_and_scores, key=obtained_words_and_scores.__getitem__,reverse=True)
ok_obtained_words = []
cur_word = 0
while len(ok_obtained_words) < words_limit and cur_word < len(sorted_words):
if obtained_words_doc[sorted_words[cur_word]] not in ok_obtained_words:
ok_obtained_words.append(obtained_words_doc[sorted_words[cur_word]])
cur_word += 1
obtained_words_doc = ok_obtained_words
else:
# just remove duplicate selections
obtained_words_doc.sort()
obtained_words_doc = list(w for w, _ in itertools.groupby(obtained_words_doc))
obtained_words[doc] = obtained_words_doc
i += 1
return obtained_words
def get_valid_patterns(answer_set):
"""
Remove the answers from a set that do NOT match the keyphrase part-of-speech patterns.
:param answer_set: a dictionary of documents and tokenized keyphrases
:return: a dictionary of documents and tokenized keyphrases that match the part-of-speech patterns
"""
doc_filtered = {}
for doc, kps in answer_set.items():
filtered_keyphrases = []
for kp in kps:
for valid_kp in chunker.extract_valid_tokens(kp):
filtered_keyphrases.append(valid_kp)
# remove duplicates
filtered_keyphrases.sort()
filtered_keyphrases = list(w for w, _ in itertools.groupby(filtered_keyphrases))
doc_filtered[doc] = filtered_keyphrases
return doc_filtered
def clean_answers(answer_set):
"""
Cleans the keyphrases by removing the tokens that are not PoS tagged with the allowed tags.
:param answer_set: a dictionary of documents and tokenized keyphrases
:return: a dictionary of documents and their cleaned tokenized keyphrases
"""
doc_filtered = {}
for doc, kps in answer_set.items():
filtered_keyphrases = []
for kp in kps:
clean_kp = cleaner.clean_tokens(kp)
if clean_kp:
filtered_keyphrases.append(clean_kp)
        # remove duplicates
filtered_keyphrases.sort()
filtered_keyphrases = list(w for w, _ in itertools.groupby(filtered_keyphrases))
doc_filtered[doc] = filtered_keyphrases
return doc_filtered
def get_answers(candidate_tokens,predict_set,predict_result,dictionary):
"""
Build the dictionary of the selected answer for a QA-based network.
:param candidate_tokens: the dictionary of the documents and their candidate KPs
:param predict_set: the input of the network
:param predict_result: the output of the network
:param dictionary: the previously-fit word index
:return: the dictionary of the selected KPs
"""
    # Here the idea is: we go through the dictionary of the candidates, we find the corresponding
    # model input, and we add the candidate to the answer set if the model predicted class 1
    # (i.e. that the candidate was a correct KP).
# First, get the actual predictions:
if np.shape(predict_result)[1] == 1:
        # If we have just 1 output neuron, reshape and round the output to 0/1 values
predictions_flattened = np.round(np.reshape(predict_result,np.shape(predict_result)[0]))
else:
        # If we're working with categorical output, flatten the (num_samples,2) array to a (num_samples) one
        # This way we transform a 2D array, e.g. [[0.6,0.4] ... [0.2,0.8]], to a 1D array, e.g. [0...1]
predictions_flattened = np.argmax(predict_result, axis=1)
i = 0
answers = {}
for doc_id, candidate_list in candidate_tokens.items() :
answers[doc_id] = []
for candidate in candidate_list:
# Sanity check: was the order preserved?
assert candidate == dictionary.tokens_to_words(predict_set[1][i])
if predictions_flattened[i] == 1 :
answers[doc_id].append(candidate)
i += 1
return answers
def get_top_answers(candidate_tokens,predict_set,predict_result,dictionary,limit):
"""
Build the dictionary of the selected answer for a QA-based network.
:param candidate_tokens: the dictionary of the documents and their candidate KPs
:param predict_set: the input of the network
:param predict_result: the output of the network
    :param dictionary: the previously-fit word index
    :param limit: the maximum number of KPs to keep per document
    :return: the dictionary of the selected KPs
"""
    # Here the idea is: we go through the dictionary of the candidates, we find the corresponding
    # model input, and we add the candidate to the answer set if the model predicted class 1
    # (i.e. that the candidate was a correct KP).
# First, get the actual predictions:
if np.shape(predict_result)[1] == 1:
        # If we have just 1 output neuron, reshape and round the output to 0/1 values
predictions_flattened = np.round(np.reshape(predict_result,np.shape(predict_result)[0]))
else:
        # If we're working with categorical output, flatten the (num_samples,2) array to a (num_samples) one
        # This way we transform a 2D array, e.g. [[0.6,0.4] ... [0.2,0.8]], to a 1D array, e.g. [0...1]
predictions_flattened = np.argmax(predict_result, axis=1)
i = 0
answers = {}
scores = {}
for doc_id, candidate_list in candidate_tokens.items() :
answers[doc_id] = []
scores[doc_id] = []
for candidate in candidate_list:
# Sanity check: was the order preserved?
assert candidate == dictionary.tokens_to_words(predict_set[1][i])
if predictions_flattened[i] == 1 :
answers[doc_id].append(candidate)
if np.shape(predict_result)[1] == 1:
scores[doc_id].append(predict_result[i][0])
else:
scores[doc_id].append(predict_result[i][1])
i += 1
if len(answers[doc_id]) > limit :
answers[doc_id] = [x for _,x in sorted(zip(scores[doc_id],answers[doc_id]),reverse=True)][:limit]
return answers
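
A minimal decoding sketch for undo_sequential and get_words on toy shapes (1 document, 5 tokens, 3 classes):

import numpy as np
from utils import postprocessing

docs = {"doc1": ["cutting", "carbon", "emissions", "is", "hard"]}
output = np.array([[[0.9, 0.1, 0.0],    # cutting   -> class 0 (outside)
                    [0.1, 0.8, 0.1],    # carbon    -> class 1 (KP start)
                    [0.1, 0.2, 0.7],    # emissions -> class 2 (KP continuation)
                    [0.9, 0.1, 0.0],    # is        -> class 0
                    [0.8, 0.1, 0.1]]])  # hard      -> class 0

tokens = postprocessing.undo_sequential(output)  # [[0, 1, 2, 0, 0]]
print(postprocessing.get_words(docs, tokens))    # {'doc1': [['carbon', 'emissions']]}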

560
utils/preprocessing.py Normal file

@ -0,0 +1,560 @@
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from utils import glove
from nlp import dictionary as dict
import logging
import numpy as np
import random
def prepare_answer(train_doc, train_answer, train_candidates,
test_doc, test_answer, test_candidates,
val_doc=None, val_answer=None, val_candidates=None,
max_document_length=1000,
max_answer_length=20,
max_vocabulary_size=50000,
embeddings_size=50):
"""
Prepares a dataset for use by a question-answer like model. This version will use the patterns generated
    previously for the training, test and validation sets as candidates for all three sets.
:param train_doc: the training documents
:param train_answer: the KPs for the training documents
:param train_candidates: the candidate KPs for the training documents
:param test_doc: the test documents
:param test_answer: the KPs for the test documents
:param test_candidates: the candidate KPs for the test documents
:param val_doc: the validation documents (can be None)
:param val_answer: the KPs for the validation documents (can be None)
:param val_candidates: the candidate KPs for the validation documents (can be None)
:param max_document_length: the maximum length of the documents (shorter documents will be truncated!)
:param max_answer_length: the maximum length of the answers (shorter answers will be truncated!)
:param max_vocabulary_size: the maximum size of the vocabulary to use
(i.e. we keep only the top max_vocabulary_size words)
:param embeddings_size: the size of the GLoVE embeddings to use
    :return: a tuple ([train_q, train_a], train_y, [test_q, test_a], test_y, [val_q, val_a], val_y,
             embedding_matrix, dictionary) containing the training, test and validation sets,
             an embedding matrix for an Embedding layer, and the fitted word dictionary
"""
# Prepare validation return data
val_q = None
val_a = None
val_y = None
# Prepare the return values: lists that will hold questions (documents), answers (keyphrases), and truth values
train_q = []
test_q = []
train_a = []
test_a = []
train_y = []
test_y = []
if val_doc and val_answer:
val_q = []
val_a = []
val_y = []
documents_full = []
for key, doc in train_doc.items():
documents_full.append(token for token in doc)
for key, doc in test_doc.items():
documents_full.append(token for token in doc)
if val_doc and val_answer:
for key, doc in val_doc.items():
documents_full.append(token for token in doc)
logging.debug("Fitting dictionary on %s documents..." % len(documents_full))
dictionary = dict.Dictionary(num_words=max_vocabulary_size)
dictionary.fit_on_texts(documents_full)
logging.debug("Dictionary fitting completed. Found %s unique tokens" % len(dictionary.word_index))
# Pair up each document with a candidate keyphrase and its truth value
for key, document in train_doc.items():
doc_sequence = dictionary.token_list_to_sequence(document)
for kp in train_candidates[key]:
train_q.append(doc_sequence)
train_a.append(dictionary.token_list_to_sequence(kp))
train_y.append([0, 1] if kp in train_answer[key] else [1, 0])
for key, document in test_doc.items():
doc_sequence = dictionary.token_list_to_sequence(document)
for kp in test_candidates[key]:
test_q.append(doc_sequence)
test_a.append(dictionary.token_list_to_sequence(kp))
test_y.append([0, 1] if kp in test_answer[key] else [1, 0])
if val_doc and val_answer:
for key, document in val_doc.items():
doc_sequence = dictionary.token_list_to_sequence(document)
for kp in val_candidates[key]:
val_q.append(doc_sequence)
val_a.append(dictionary.token_list_to_sequence(kp))
val_y.append([0, 1] if kp in val_answer[key] else [1, 0])
logging.debug("Longest training document : %s tokens" % len(max(train_q, key=len)))
logging.debug("Longest training answer : %s tokens" % len(max(train_a, key=len)))
logging.debug("Longest test document : %s tokens" % len(max(test_q, key=len)))
logging.debug("Longest test answer : %s tokens" % len(max(test_a, key=len)))
if val_doc and val_answer:
logging.debug("Longest validation document : %s tokens" % len(max(val_q, key=len)))
logging.debug("Longest validation answer : %s tokens" % len(max(val_a, key=len)))
train_q = np.asarray(pad_sequences(train_q, maxlen=max_document_length, padding='post', truncating='post'))
train_a = np.asarray(pad_sequences(train_a, maxlen=max_answer_length, padding='post', truncating='post'))
test_q = np.asarray(pad_sequences(test_q, maxlen=max_document_length, padding='post', truncating='post'))
test_a = np.asarray(pad_sequences(test_a, maxlen=max_answer_length, padding='post', truncating='post'))
if val_doc and val_answer:
val_q = np.asarray(pad_sequences(val_q, maxlen=max_document_length, padding='post', truncating='post'))
val_a = np.asarray(pad_sequences(val_a, maxlen=max_answer_length, padding='post', truncating='post'))
logging.debug("Training set documents size : %s", np.shape(train_q))
logging.debug("Training set answers size : %s", np.shape(train_a))
logging.debug("Test set documents size : %s", np.shape(test_q))
logging.debug("Test set answers size : %s ", np.shape(test_a))
if val_doc and val_answer:
logging.debug("Validation set documents size : %s", np.shape(val_q))
logging.debug("Validation set answers size : %s ", np.shape(val_a))
# prepare the matrix for the embedding layer
word_index = dictionary.word_index
embeddings_index = glove.load_glove('', embeddings_size)
num_words = min(max_vocabulary_size, 1 + len(word_index))
logging.debug("Building embedding matrix of size [%s,%s]..." % (num_words, embeddings_size))
embedding_matrix = np.zeros((num_words, embeddings_size))
for word, i in word_index.items():
if i >= num_words:
continue
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
# words not found in embedding index will be all-zeros.
embedding_matrix[i] = embedding_vector
return [train_q, train_a], train_y, [test_q, test_a], test_y, [val_q, val_a], val_y, embedding_matrix, dictionary
def prepare_answer_2(train_doc, train_answer, train_candidates,
test_doc, test_answer, test_candidates,
val_doc=None, val_answer=None, val_candidates=None,
max_document_length=1000,
max_answer_length=20,
max_vocabulary_size=50000,
embeddings_size=50):
"""
    Prepares a dataset for use by a question-answer like model. This version will use the patterns generated
    previously for the test and validation sets as candidates for these sets, and mix the correct answers with
    wrong patterns on the training set in order to have balanced data for training.
:param train_doc: the training documents
:param train_answer: the KPs for the training documents
:param train_candidates: the candidate KPs for the training documents
:param test_doc: the test documents
:param test_answer: the KPs for the test documents
:param test_candidates: the candidate KPs for the test documents
:param val_doc: the validation documents (can be None)
:param val_answer: the KPs for the validation documents (can be None)
:param val_candidates: the candidate KPs for the validation documents (can be None)
:param max_document_length: the maximum length of the documents (shorter documents will be truncated!)
:param max_answer_length: the maximum length of the answers (shorter answers will be truncated!)
:param max_vocabulary_size: the maximum size of the vocabulary to use
(i.e. we keep only the top max_vocabulary_size words)
:param embeddings_size: the size of the GLoVE embeddings to use
    :return: a tuple ([train_q, train_a], train_y, [test_q, test_a], test_y, [val_q, val_a], val_y,
             [val_q_balanced, val_a_balanced], val_y_balanced, embedding_matrix, dictionary) containing the
             training, test, validation and balanced validation sets, an embedding matrix, and the fitted dictionary
"""
# Prepare validation return data
val_q = None
val_a = None
val_y = None
val_q_balanced = None
val_a_balanced = None
val_y_balanced = None
# Prepare the return values: lists that will hold questions (documents), answers (keyphrases), and truth values
train_q = []
test_q = []
train_a = []
test_a = []
train_y = []
test_y = []
if val_doc and val_answer:
val_q = []
val_a = []
val_y = []
val_q_balanced = []
val_a_balanced = []
val_y_balanced = []
documents_full = []
for key, doc in train_doc.items():
documents_full.append(token for token in doc)
for key, doc in test_doc.items():
documents_full.append(token for token in doc)
if val_doc and val_answer:
for key, doc in val_doc.items():
documents_full.append(token for token in doc)
logging.debug("Fitting dictionary on %s documents..." % len(documents_full))
dictionary = dict.Dictionary(num_words=max_vocabulary_size)
dictionary.fit_on_texts(documents_full)
logging.debug("Dictionary fitting completed. Found %s unique tokens" % len(dictionary.word_index))
# Pair up each document with a candidate keyphrase and its truth value
for key, document in train_doc.items():
doc_sequence = dictionary.token_list_to_sequence(document)
# select wrong candidates (possibly, in same quantity as good answers)
wrong_candidates = list(train_candidates[key])
for answer in train_answer[key]:
if answer in wrong_candidates:
wrong_candidates.remove(answer)
while len(wrong_candidates) > len(train_answer[key]):
random_candidate = random.choice(wrong_candidates)
wrong_candidates.remove(random_candidate)
# append wrong candidates
for kp in wrong_candidates:
train_q.append(doc_sequence)
train_a.append(dictionary.token_list_to_sequence(kp))
train_y.append([1, 0])
# append true answers
for kp in train_answer[key]:
train_q.append(doc_sequence)
train_a.append(dictionary.token_list_to_sequence(kp))
train_y.append([0, 1])
if val_doc and val_answer:
for key, document in val_doc.items():
doc_sequence = dictionary.token_list_to_sequence(document)
# select wrong candidates (possibly, in same quantity as good answers)
wrong_candidates = list(val_candidates[key])
for answer in val_answer[key]:
if answer in wrong_candidates:
wrong_candidates.remove(answer)
while len(wrong_candidates) > len(val_answer[key]):
random_candidate = random.choice(wrong_candidates)
wrong_candidates.remove(random_candidate)
# append wrong candidates
for kp in wrong_candidates:
val_q_balanced.append(doc_sequence)
val_a_balanced.append(dictionary.token_list_to_sequence(kp))
val_y_balanced.append([1, 0])
# append true answers
for kp in val_answer[key]:
val_q_balanced.append(doc_sequence)
val_a_balanced.append(dictionary.token_list_to_sequence(kp))
val_y_balanced.append([0, 1])
# for the other sets, just pick the auto-generated candidates
for key, document in test_doc.items():
doc_sequence = dictionary.token_list_to_sequence(document)
for kp in test_candidates[key]:
test_q.append(doc_sequence)
test_a.append(dictionary.token_list_to_sequence(kp))
test_y.append([0, 1] if kp in test_answer[key] else [1, 0])
if val_doc and val_answer:
for key, document in val_doc.items():
doc_sequence = dictionary.token_list_to_sequence(document)
for kp in val_candidates[key]:
val_q.append(doc_sequence)
val_a.append(dictionary.token_list_to_sequence(kp))
val_y.append([0, 1] if kp in val_answer[key] else [1, 0])
logging.debug("Longest training document : %s tokens" % len(max(train_q, key=len)))
logging.debug("Longest training answer : %s tokens" % len(max(train_a, key=len)))
logging.debug("Longest test document : %s tokens" % len(max(test_q, key=len)))
logging.debug("Longest test answer : %s tokens" % len(max(test_a, key=len)))
if val_doc and val_answer:
logging.debug("Longest validation document : %s tokens" % len(max(val_q, key=len)))
logging.debug("Longest validation answer : %s tokens" % len(max(val_a, key=len)))
logging.debug("Longest balanced validation document : %s tokens" % len(max(val_q, key=len)))
logging.debug("Longest balanced validation answer : %s tokens" % len(max(val_a, key=len)))
train_q = np.asarray(pad_sequences(train_q, maxlen=max_document_length, padding='post', truncating='post'))
train_a = np.asarray(pad_sequences(train_a, maxlen=max_answer_length, padding='post', truncating='post'))
test_q = np.asarray(pad_sequences(test_q, maxlen=max_document_length, padding='post', truncating='post'))
test_a = np.asarray(pad_sequences(test_a, maxlen=max_answer_length, padding='post', truncating='post'))
if val_doc and val_answer:
val_q = np.asarray(pad_sequences(val_q, maxlen=max_document_length, padding='post', truncating='post'))
val_a = np.asarray(pad_sequences(val_a, maxlen=max_answer_length, padding='post', truncating='post'))
val_q_balanced = np.asarray(pad_sequences(val_q_balanced, maxlen=max_document_length, padding='post', truncating='post'))
val_a_balanced = np.asarray(pad_sequences(val_a_balanced, maxlen=max_answer_length, padding='post', truncating='post'))
logging.debug("Training set documents size : %s", np.shape(train_q))
logging.debug("Training set answers size : %s", np.shape(train_a))
logging.debug("Test set documents size : %s", np.shape(test_q))
logging.debug("Test set answers size : %s ", np.shape(test_a))
if val_doc and val_answer:
logging.debug("Validation set documents size : %s", np.shape(val_q))
logging.debug("Validation set answers size : %s ", np.shape(val_a))
logging.debug("Balanced Validation set documents size : %s", np.shape(val_q_balanced))
logging.debug("Balanced Validation set answers size : %s ", np.shape(val_a_balanced))
# prepare the matrix for the embedding layer
word_index = dictionary.word_index
embeddings_index = glove.load_glove('', embeddings_size)
num_words = min(max_vocabulary_size, 1 + len(word_index))
logging.debug("Building embedding matrix of size [%s,%s]..." % (num_words, embeddings_size))
embedding_matrix = np.zeros((num_words, embeddings_size))
for word, i in word_index.items():
if i >= num_words:
continue
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
# words not found in embedding index will be all-zeros.
embedding_matrix[i] = embedding_vector
return [train_q, train_a], train_y, [test_q, test_a], test_y, [val_q, val_a], val_y, \
[val_q_balanced, val_a_balanced], val_y_balanced, embedding_matrix, dictionary
def prepare_sequential(train_doc, train_answer, test_doc, test_answer, val_doc, val_answer,
max_document_length=35000,
max_vocabulary_size=5000,
embeddings_size=50,
stem_test = False):
"""
Prepares a dataset for use by a sequential, categorical model.
:param train_doc: the training documents
:param train_answer: the KPs for the training documents
:param test_doc: the test documents
:param test_answer: the KPs for the test documents
:param val_doc: the validation documents (can be None)
:param val_answer: the KPs for the validation documents (can be None)
:param max_document_length: the maximum length of the documents (shorter documents will be truncated!)
:param max_vocabulary_size: the maximum size of the vocabulary to use
(i.e. we keep only the top max_vocabulary_size words)
:param embeddings_size: the size of the GLoVE embeddings to use
:param stem_test: set the value to True if the test set answers are stemmed
:return: a tuple (train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix) containing the training,
test and validation set, and an embedding matrix for an Embedding layer
"""
train_answer_seq = make_sequential(train_doc, train_answer)
if not stem_test:
test_answer_seq = make_sequential(test_doc, test_answer)
else:
import copy
stemmed_test_doc = copy.deepcopy(test_doc)
stemmed_test_doc = stem_dataset(stemmed_test_doc)
test_answer_seq = make_sequential(stemmed_test_doc,test_answer)
# Prepare validation return data
val_x = None
val_y = None
if val_doc and val_answer:
val_answer_seq = make_sequential(val_doc, val_answer)
# Transform the documents to sequence
documents_full = []
train_y = []
test_y = []
if val_doc and val_answer:
val_y = []
for key, doc in train_doc.items():
documents_full.append(token for token in doc)
train_y.append(train_answer_seq[key])
for key, doc in test_doc.items():
documents_full.append(token for token in doc)
test_y.append(test_answer_seq[key])
if val_doc and val_answer:
for key, doc in val_doc.items():
documents_full.append(token for token in doc)
val_y.append(val_answer_seq[key])
logging.debug("Fitting dictionary on %s documents..." % len(documents_full))
dictionary = dict.Dictionary(num_words=max_vocabulary_size)
dictionary.fit_on_texts(documents_full)
logging.debug("Dictionary fitting completed. Found %s unique tokens" % len(dictionary.word_index))
# Now we can prepare the actual input
train_x = dictionary.texts_to_sequences(train_doc.values())
test_x = dictionary.texts_to_sequences(test_doc.values())
if val_doc and val_answer:
val_x = dictionary.texts_to_sequences(val_doc.values())
logging.debug("Longest training document : %s tokens" % len(max(train_x, key=len)))
logging.debug("Longest test document : %s tokens" % len(max(test_x, key=len)))
if val_doc and val_answer:
logging.debug("Longest validation document : %s tokens" % len(max(val_x, key=len)))
train_x = np.asarray(pad_sequences(train_x, maxlen=max_document_length, padding='post', truncating='post'))
train_y = pad_sequences(train_y, maxlen=max_document_length, padding='post', truncating='post')
train_y = make_categorical(train_y)
test_x = np.asarray(pad_sequences(test_x, maxlen=max_document_length, padding='post', truncating='post'))
test_y = pad_sequences(test_y, maxlen=max_document_length, padding='post', truncating='post')
test_y = make_categorical(test_y)
if val_doc and val_answer:
val_x = np.asarray(pad_sequences(val_x, maxlen=max_document_length, padding='post', truncating='post'))
val_y = pad_sequences(val_y, maxlen=max_document_length, padding='post', truncating='post')
val_y = make_categorical(val_y)
logging.debug("Training set samples size : %s", np.shape(train_x))
logging.debug("Training set answers size : %s", np.shape(train_y))
logging.debug("Test set samples size : %s", np.shape(test_x))
logging.debug("Test set answers size : %s ", np.shape(test_y))
if val_doc and val_answer:
logging.debug("Validation set samples size : %s", np.shape(val_x))
logging.debug("Validation set answers size : %s ", np.shape(val_y))
# prepare the matrix for the embedding layer
word_index = dictionary.word_index
embeddings_index = glove.load_glove('', embeddings_size)
num_words = min(max_vocabulary_size, 1 + len(word_index))
logging.debug("Building embedding matrix of size [%s,%s]..." % (num_words, embeddings_size))
embedding_matrix = np.zeros((num_words, embeddings_size))
for word, i in word_index.items():
if i >= num_words:
continue
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
# words not found in embedding index will be all-zeros.
embedding_matrix[i] = embedding_vector
return train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix
def make_sequential(documents, answers):
"""
Transform an answer-based dataset (i.e. with a list of
documents and a list of keyphrases) to a sequential, ner-like
dataset, i.e. where the answer set for each document is composed
by the lists of the documents' tokens marked as non-keyphrase (0),
beginning of keyphrase (1) and inside-keyphrase (2).
For example, for the tokens
"I am a python developer since today."
    If the keyphrases are "python developer" and "today", the answer
set for these tokens is
"[0 0 0 1 2 0 1]"
:param documents: the list of documents
:param answers: the list of keyphrases
:return: the new answer set
"""
seq_answers = {}
for key, document in documents.items():
doc_answers_set = answers[key]
        '''
        Sort the keyphrases by length: we process the shorter KPs first, so that
        if they are contained in a longer KP, the longer one will simply
        overwrite the shorter one.
        '''
doc_answers_set.sort(key=lambda a: len(a))
        '''
        This list will hold the answer sequence for the document.
        We initialize it as a list of zeros and fill in the 1s and 2s afterwards.
        '''
doc_answers_seq = [0] * len(document)
for answer in doc_answers_set:
            # find the positions where the first word of the KP appears
appearances = [i for i, word in enumerate(document) if word == answer[0]]
for idx in appearances:
is_kp = True
                # check whether the KP also matches starting from its second word
for i in range(1, len(answer)):
if (i + idx) < len(document):
is_kp = answer[i] == document[i + idx]
else:
                        # end of document
                        is_kp = False
                # if we found an actual KP, mark its tokens in the output list
if is_kp:
doc_answers_seq[idx] = 1
for i in range(1, len(answer)):
doc_answers_seq[idx + i] = 2
seq_answers[key] = doc_answers_seq
return seq_answers
def make_categorical(x):
"""
Transform a two-dimensional list into a 3-dimensional array. The 2nd
dimension of the input list becomes a one-hot 2D array, e.g.
if the input is [[1,2,0],...], the output will be
[[[0,1,0],[0,0,1],[1,0,0]],...]
:param x: a 2D-list
:return: a 3D-numpy array
"""
    # number of categories
    num_categories = max([item for sublist in x for item in sublist]) + 1
    # output as a numpy array
    new_x = np.zeros((len(x), len(x[0]), num_categories))
    # use keras to perform the actual categorical conversion
i = 0
for doc in x:
new_doc = np_utils.to_categorical(doc, num_classes=num_categories)
new_x[i] = new_doc
i += 1
return new_x
def stem_dataset(dataset):
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
for key, tokens in dataset.items():
stemmed_tokens = [stemmer.stem(token) for token in tokens]
dataset[key] = stemmed_tokens
return dataset
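
A small check of make_sequential on the toy example from its docstring (importing the module pulls in Keras, so the repository's dependencies must be installed):

from utils import preprocessing

docs = {"doc1": ["i", "am", "a", "python", "developer", "since", "today"]}
kps = {"doc1": [["python", "developer"], ["today"]]}
print(preprocessing.make_sequential(docs, kps))  # {'doc1': [0, 0, 0, 1, 2, 0, 1]}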