EnergyNewsKeyword/utils/preprocessing.py

from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from utils import glove
from nlp import dictionary as dict
import logging
import numpy as np
import random


def prepare_answer(train_doc, train_answer, train_candidates,
                   test_doc, test_answer, test_candidates,
                   val_doc=None, val_answer=None, val_candidates=None,
                   max_document_length=1000,
                   max_answer_length=20,
                   max_vocabulary_size=50000,
                   embeddings_size=50):
    """
        Prepares a dataset for use by a question-answer like model. This version will use the patterns generated
        previously for the training, test and validation sets as candidate for all three sets.

        :param train_doc: the training documents
        :param train_answer: the KPs for the training documents
        :param train_candidates: the candidate KPs for the training documents
        :param test_doc: the test documents
        :param test_answer: the KPs for the test documents
        :param test_candidates: the candidate KPs for the test documents
        :param val_doc: the validation documents (can be None)
        :param val_answer: the KPs for the validation documents (can be None)
        :param val_candidates: the candidate KPs for the validation documents (can be None)
        :param max_document_length: the maximum length of the documents (shorter documents will be truncated!)
        :param max_answer_length: the maximum length of the answers (shorter answers will be truncated!)
        :param max_vocabulary_size: the maximum size of the vocabulary to use
        (i.e. we keep only the top max_vocabulary_size words)
        :param embeddings_size: the size of the GLoVE embeddings to use
        :return:  a tuple (train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix) containing the training,
        test and validation set, and an embedding matrix for an Embedding layer
        """

    # Prepare validation return data
    val_q = None
    val_a = None
    val_y = None

    # Prepare the return values: lists that will hold questions (documents), answers (keyphrases), and truth values
    train_q = []
    test_q = []
    train_a = []
    test_a = []
    train_y = []
    test_y = []

    if val_doc and val_answer:
        val_q = []
        val_a = []
        val_y = []

    documents_full = []
    for key, doc in train_doc.items():
        documents_full.append(token for token in doc)
    for key, doc in test_doc.items():
        documents_full.append(token for token in doc)

    if val_doc and val_answer:
        for key, doc in val_doc.items():
            documents_full.append(token for token in doc)

    logging.debug("Fitting dictionary on %s documents..." % len(documents_full))

    dictionary = dict.Dictionary(num_words=max_vocabulary_size)
    dictionary.fit_on_texts(documents_full)

    logging.debug("Dictionary fitting completed. Found %s unique tokens" % len(dictionary.word_index))

    # Pair up each document with a candidate keyphrase and its truth value
    for key, document in train_doc.items():
        doc_sequence = dictionary.token_list_to_sequence(document)
        for kp in train_candidates[key]:
            train_q.append(doc_sequence)
            train_a.append(dictionary.token_list_to_sequence(kp))
            train_y.append([0, 1] if kp in train_answer[key] else [1, 0])

    for key, document in test_doc.items():
        doc_sequence = dictionary.token_list_to_sequence(document)
        for kp in test_candidates[key]:
            test_q.append(doc_sequence)
            test_a.append(dictionary.token_list_to_sequence(kp))
            test_y.append([0, 1] if kp in test_answer[key] else [1, 0])

    if val_doc and val_answer:
        for key, document in val_doc.items():
            doc_sequence = dictionary.token_list_to_sequence(document)
            for kp in val_candidates[key]:
                val_q.append(doc_sequence)
                val_a.append(dictionary.token_list_to_sequence(kp))
                val_y.append([0, 1] if kp in val_answer[key] else [1, 0])

    logging.debug("Longest training document   : %s tokens" % len(max(train_q, key=len)))
    logging.debug("Longest training answer     : %s tokens" % len(max(train_a, key=len)))
    logging.debug("Longest test document       : %s tokens" % len(max(test_q, key=len)))
    logging.debug("Longest test answer         : %s tokens" % len(max(test_a, key=len)))
    if val_doc and val_answer:
        logging.debug("Longest validation document : %s tokens" % len(max(val_q, key=len)))
        logging.debug("Longest validation answer   : %s tokens" % len(max(val_a, key=len)))

    train_q = np.asarray(pad_sequences(train_q, maxlen=max_document_length, padding='post', truncating='post'))
    train_a = np.asarray(pad_sequences(train_a, maxlen=max_answer_length, padding='post', truncating='post'))

    test_q = np.asarray(pad_sequences(test_q, maxlen=max_document_length, padding='post', truncating='post'))
    test_a = np.asarray(pad_sequences(test_a, maxlen=max_answer_length, padding='post', truncating='post'))

    if val_doc and val_answer:
        val_q = np.asarray(pad_sequences(val_q, maxlen=max_document_length, padding='post', truncating='post'))
        val_a = np.asarray(pad_sequences(val_a, maxlen=max_answer_length, padding='post', truncating='post'))

    logging.debug("Training set documents size   : %s", np.shape(train_q))
    logging.debug("Training set answers size     : %s", np.shape(train_a))
    logging.debug("Test set documents size       : %s", np.shape(test_q))
    logging.debug("Test set answers size         : %s ", np.shape(test_a))

    if val_doc and val_answer:
        logging.debug("Validation set documents size : %s", np.shape(val_q))
        logging.debug("Validation set answers size   : %s ", np.shape(val_a))

    # prepare the matrix for the embedding layer
    word_index = dictionary.word_index
    embeddings_index = glove.load_glove('', embeddings_size)

    num_words = min(max_vocabulary_size, 1 + len(word_index))

    logging.debug("Building embedding matrix of size [%s,%s]..." % (num_words, embeddings_size))

    embedding_matrix = np.zeros((num_words, embeddings_size))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    return [train_q, train_a], train_y, [test_q, test_a], test_y, [val_q, val_a], val_y, embedding_matrix, dictionary


def prepare_answer_2(train_doc, train_answer, train_candidates,
                     test_doc, test_answer, test_candidates,
                     val_doc=None, val_answer=None, val_candidates=None,
                     max_document_length=1000,
                     max_answer_length=20,
                     max_vocabulary_size=50000,
                     embeddings_size=50):
    """
        Prepares a dataset for use by a question-answer like model. This version will use the patterns generated
        previously for the test and validation sets as candidate for these sets, and mix the correct answers with
        wrong patterns on the training set to build in order to have balanced data for training.

        :param train_doc: the training documents
        :param train_answer: the KPs for the training documents
        :param train_candidates: the candidate KPs for the training documents
        :param test_doc: the test documents
        :param test_answer: the KPs for the test documents
        :param test_candidates: the candidate KPs for the test documents
        :param val_doc: the validation documents (can be None)
        :param val_answer: the KPs for the validation documents (can be None)
        :param val_candidates: the candidate KPs for the validation documents (can be None)
        :param max_document_length: the maximum length of the documents (shorter documents will be truncated!)
        :param max_answer_length: the maximum length of the answers (shorter answers will be truncated!)
        :param max_vocabulary_size: the maximum size of the vocabulary to use
        (i.e. we keep only the top max_vocabulary_size words)
        :param embeddings_size: the size of the GLoVE embeddings to use
        :return:  a tuple (train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix) containing the training,
        test and validation set, and an embedding matrix for an Embedding layer
        """

    # Prepare validation return data
    val_q = None
    val_a = None
    val_y = None

    val_q_balanced = None
    val_a_balanced = None
    val_y_balanced = None

    # Prepare the return values: lists that will hold questions (documents), answers (keyphrases), and truth values
    train_q = []
    test_q = []
    train_a = []
    test_a = []
    train_y = []
    test_y = []

    if val_doc and val_answer:
        val_q = []
        val_a = []
        val_y = []
        val_q_balanced = []
        val_a_balanced = []
        val_y_balanced = []

    documents_full = []
    for key, doc in train_doc.items():
        documents_full.append(token for token in doc)
    for key, doc in test_doc.items():
        documents_full.append(token for token in doc)

    if val_doc and val_answer:
        for key, doc in val_doc.items():
            documents_full.append(token for token in doc)

    logging.debug("Fitting dictionary on %s documents..." % len(documents_full))

    dictionary = dict.Dictionary(num_words=max_vocabulary_size)
    dictionary.fit_on_texts(documents_full)

    logging.debug("Dictionary fitting completed. Found %s unique tokens" % len(dictionary.word_index))

    # Pair up each document with a candidate keyphrase and its truth value
    for key, document in train_doc.items():
        doc_sequence = dictionary.token_list_to_sequence(document)

        # select wrong candidates (possibly, in same quantity as good answers)
        wrong_candidates = list(train_candidates[key])
        for answer in train_answer[key]:
            if answer in wrong_candidates:
                wrong_candidates.remove(answer)

        while len(wrong_candidates) > len(train_answer[key]):
            random_candidate = random.choice(wrong_candidates)
            wrong_candidates.remove(random_candidate)

        # append wrong candidates
        for kp in wrong_candidates:
            train_q.append(doc_sequence)
            train_a.append(dictionary.token_list_to_sequence(kp))
            train_y.append([1, 0])

        # append true answers
        for kp in train_answer[key]:
            train_q.append(doc_sequence)
            train_a.append(dictionary.token_list_to_sequence(kp))
            train_y.append([0, 1])

    if val_doc and val_answer:
        for key, document in val_doc.items():
            doc_sequence = dictionary.token_list_to_sequence(document)

            # select wrong candidates (possibly, in same quantity as good answers)
            wrong_candidates = list(val_candidates[key])
            for answer in val_answer[key]:
                if answer in wrong_candidates:
                    wrong_candidates.remove(answer)

            while len(wrong_candidates) > len(val_answer[key]):
                random_candidate = random.choice(wrong_candidates)
                wrong_candidates.remove(random_candidate)

            # append wrong candidates
            for kp in wrong_candidates:
                val_q_balanced.append(doc_sequence)
                val_a_balanced.append(dictionary.token_list_to_sequence(kp))
                val_y_balanced.append([1, 0])

            # append true answers
            for kp in val_answer[key]:
                val_q_balanced.append(doc_sequence)
                val_a_balanced.append(dictionary.token_list_to_sequence(kp))
                val_y_balanced.append([0, 1])

    # for the other sets, just pick the auto-generated candidates
    for key, document in test_doc.items():
        doc_sequence = dictionary.token_list_to_sequence(document)
        for kp in test_candidates[key]:
            test_q.append(doc_sequence)
            test_a.append(dictionary.token_list_to_sequence(kp))
            test_y.append([0, 1] if kp in test_answer[key] else [1, 0])

    if val_doc and val_answer:
        for key, document in val_doc.items():
            doc_sequence = dictionary.token_list_to_sequence(document)
            for kp in val_candidates[key]:
                val_q.append(doc_sequence)
                val_a.append(dictionary.token_list_to_sequence(kp))
                val_y.append([0, 1] if kp in val_answer[key] else [1, 0])

    logging.debug("Longest training document            : %s tokens" % len(max(train_q, key=len)))
    logging.debug("Longest training answer              : %s tokens" % len(max(train_a, key=len)))
    logging.debug("Longest test document                : %s tokens" % len(max(test_q, key=len)))
    logging.debug("Longest test answer                  : %s tokens" % len(max(test_a, key=len)))
    if val_doc and val_answer:
        logging.debug("Longest validation document          : %s tokens" % len(max(val_q, key=len)))
        logging.debug("Longest validation answer            : %s tokens" % len(max(val_a, key=len)))
        logging.debug("Longest balanced validation document : %s tokens" % len(max(val_q, key=len)))
        logging.debug("Longest balanced validation answer   : %s tokens" % len(max(val_a, key=len)))

    train_q = np.asarray(pad_sequences(train_q, maxlen=max_document_length, padding='post', truncating='post'))
    train_a = np.asarray(pad_sequences(train_a, maxlen=max_answer_length, padding='post', truncating='post'))

    test_q = np.asarray(pad_sequences(test_q, maxlen=max_document_length, padding='post', truncating='post'))
    test_a = np.asarray(pad_sequences(test_a, maxlen=max_answer_length, padding='post', truncating='post'))

    if val_doc and val_answer:
        val_q = np.asarray(pad_sequences(val_q, maxlen=max_document_length, padding='post', truncating='post'))
        val_a = np.asarray(pad_sequences(val_a, maxlen=max_answer_length, padding='post', truncating='post'))
        val_q_balanced = np.asarray(pad_sequences(val_q_balanced, maxlen=max_document_length, padding='post', truncating='post'))
        val_a_balanced = np.asarray(pad_sequences(val_a_balanced, maxlen=max_answer_length, padding='post', truncating='post'))

    logging.debug("Training set documents size            : %s", np.shape(train_q))
    logging.debug("Training set answers size              : %s", np.shape(train_a))
    logging.debug("Test set documents size                : %s", np.shape(test_q))
    logging.debug("Test set answers size                  : %s ", np.shape(test_a))

    if val_doc and val_answer:
        logging.debug("Validation set documents size          : %s", np.shape(val_q))
        logging.debug("Validation set answers size            : %s ", np.shape(val_a))
        logging.debug("Balanced Validation set documents size : %s", np.shape(val_q_balanced))
        logging.debug("Balanced Validation set answers size   : %s ", np.shape(val_a_balanced))

    # prepare the matrix for the embedding layer
    word_index = dictionary.word_index
    embeddings_index = glove.load_glove('', embeddings_size)

    num_words = min(max_vocabulary_size, 1 + len(word_index))

    logging.debug("Building embedding matrix of size [%s,%s]..." % (num_words, embeddings_size))

    embedding_matrix = np.zeros((num_words, embeddings_size))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    return [train_q, train_a], train_y, [test_q, test_a], test_y, [val_q, val_a], val_y, \
           [val_q_balanced, val_a_balanced], val_y_balanced, embedding_matrix, dictionary


def prepare_sequential(train_doc, train_answer, test_doc, test_answer, val_doc, val_answer,
                       max_document_length=35000,
                       max_vocabulary_size=5000,
                       embeddings_size=50,
                       stem_test = False):
    """
        Prepares a dataset for use by a sequential, categorical model.

        :param train_doc: the training documents
        :param train_answer: the KPs for the training documents
        :param test_doc: the test documents
        :param test_answer: the KPs for the test documents
        :param val_doc: the validation documents (can be None)
        :param val_answer: the KPs for the validation documents (can be None)
        :param max_document_length: the maximum length of the documents (shorter documents will be truncated!)
        :param max_vocabulary_size: the maximum size of the vocabulary to use
        (i.e. we keep only the top max_vocabulary_size words)
        :param embeddings_size: the size of the GLoVE embeddings to use
        :param stem_test: set the value to True if the test set answers are stemmed
        :return: a tuple (train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix) containing the training,
        test and validation set, and an embedding matrix for an Embedding layer
        """

    train_answer_seq = make_sequential(train_doc, train_answer)

    if not stem_test:
        test_answer_seq = make_sequential(test_doc, test_answer)
    else:
        import copy
        stemmed_test_doc = copy.deepcopy(test_doc)
        stemmed_test_doc = stem_dataset(stemmed_test_doc)
        test_answer_seq = make_sequential(stemmed_test_doc,test_answer)

    # Prepare validation return data
    val_x = None
    val_y = None

    if val_doc and val_answer:
        val_answer_seq = make_sequential(val_doc, val_answer)

    # Transform the documents to sequence
    documents_full = []
    train_y = []
    test_y = []

    if val_doc and val_answer:
        val_y = []

    for key, doc in train_doc.items():
        documents_full.append(token for token in doc)
        train_y.append(train_answer_seq[key])
    for key, doc in test_doc.items():
        documents_full.append(token for token in doc)
        test_y.append(test_answer_seq[key])

    if val_doc and val_answer:
        for key, doc in val_doc.items():
            documents_full.append(token for token in doc)
            val_y.append(val_answer_seq[key])

    logging.debug("Fitting dictionary on %s documents..." % len(documents_full))

    dictionary = dict.Dictionary(num_words=max_vocabulary_size)
    dictionary.fit_on_texts(documents_full)

    logging.debug("Dictionary fitting completed. Found %s unique tokens" % len(dictionary.word_index))

    # Now we can prepare the actual input
    train_x = dictionary.texts_to_sequences(train_doc.values())
    test_x = dictionary.texts_to_sequences(test_doc.values())
    if val_doc and val_answer:
        val_x = dictionary.texts_to_sequences(val_doc.values())

    logging.debug("Longest training document : %s tokens" % len(max(train_x, key=len)))
    logging.debug("Longest test document :     %s tokens" % len(max(test_x, key=len)))
    if val_doc and val_answer:
        logging.debug("Longest validation document : %s tokens" % len(max(val_x, key=len)))

    train_x = np.asarray(pad_sequences(train_x, maxlen=max_document_length, padding='post', truncating='post'))
    train_y = pad_sequences(train_y, maxlen=max_document_length, padding='post', truncating='post')
    train_y = make_categorical(train_y)

    test_x = np.asarray(pad_sequences(test_x, maxlen=max_document_length, padding='post', truncating='post'))
    test_y = pad_sequences(test_y, maxlen=max_document_length, padding='post', truncating='post')
    test_y = make_categorical(test_y)

    if val_doc and val_answer:
        val_x = np.asarray(pad_sequences(val_x, maxlen=max_document_length, padding='post', truncating='post'))
        val_y = pad_sequences(val_y, maxlen=max_document_length, padding='post', truncating='post')
        val_y = make_categorical(val_y)

    logging.debug("Training set samples size   : %s", np.shape(train_x))
    logging.debug("Training set answers size   : %s", np.shape(train_y))
    logging.debug("Test set samples size       : %s", np.shape(test_x))
    logging.debug("Test set answers size       : %s ", np.shape(test_y))

    if val_doc and val_answer:
        logging.debug("Validation set samples size : %s", np.shape(val_x))
        logging.debug("Validation set answers size : %s ", np.shape(val_y))

    # prepare the matrix for the embedding layer
    word_index = dictionary.word_index
    embeddings_index = glove.load_glove('', embeddings_size)

    num_words = min(max_vocabulary_size, 1 + len(word_index))

    logging.debug("Building embedding matrix of size [%s,%s]..." % (num_words, embeddings_size))

    embedding_matrix = np.zeros((num_words, embeddings_size))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    return train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix


def make_sequential(documents, answers):
    """
    Transform an answer-based dataset (i.e. with a list of
    documents and a list of keyphrases) to a sequential, ner-like
    dataset, i.e. where the answer set for each document is composed
    by the lists of the documents' tokens marked as non-keyphrase (0),
    beginning of keyphrase (1) and inside-keyphrase (2).

    For example, for the tokens

    "I am a python developer since today."

    If the keyphrases are "python developer" and "today"" the answer
    set for these tokens is

    "[0 0 0 1 2 0 1]"

    :param documents: the list of documents
    :param answers: the list of keyphrases
    :return: the new answer set
    """

    seq_answers = {}

    for key, document in documents.items():
        doc_answers_set = answers[key]
        '''
        按关键字的长度排序。 我们首先处理较短的 KP
        如果它们包含在更长的 KP 中，将简单地覆盖
        短的配长的
        '''
        doc_answers_set.sort(key=lambda a: len(a))

        '''
        该字段将包含答案。
        我们将它初始化为一个零列表，然后我们将填充它
        1s 和 2s 之后
       '''
        doc_answers_seq = [0] * len(document)

        for answer in doc_answers_set:
            # 查找 KP 出现的第一个单词的位置
            appearances = [i for i, word in enumerate(document) if word == answer[0]]
            for idx in appearances:
                is_kp = True
                # 检查 KP 是否也从它的第二个词开始匹配
                for i in range(1, len(answer)):

                    if (i + idx) < len(document):
                        is_kp = answer[i] == document[i + idx]
                    else:
                        # 文档结尾
                        is_kp = False

                # 如果我们找到了实际的 KP，请在输出列表中标记标记。
                if is_kp:
                    doc_answers_seq[idx] = 1
                    for i in range(1, len(answer)):
                        doc_answers_seq[idx + i] = 2


        seq_answers[key] = doc_answers_seq

    return seq_answers


def make_categorical(x):
    """
    Transform a two-dimensional list into a 3-dimensional array. The 2nd
    dimension of the input list becomes a one-hot 2D array, e.g.
    if the input is [[1,2,0],...], the output will be
    [[[0,1,0],[0,0,1],[1,0,0]],...]

    :param x: a 2D-list
    :return: a 3D-numpy array
    """

    # 类别数量
    num_categories = max([item for sublist in x for item in sublist]) + 1

    # numpy格式输出
    new_x = np.zeros((len(x), len(x[0]), num_categories))

    # 使用keras进行实际的分类转换
    i = 0
    for doc in x:
        new_doc = np_utils.to_categorical(doc, num_classes=num_categories)
        new_x[i] = new_doc
        i += 1

    return new_x


def stem_dataset(dataset):

    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()

    for key, tokens in dataset.items():
        stemmed_tokens = [stemmer.stem(token) for token in tokens]
        dataset[key] = stemmed_tokens

    return dataset