# EnergyNewsKeyword/datasets.py
import logging
import os

from nlp import tokenizer as tk


class Dataset(object):
    """
    An abstract class that represents a dataset.
    """

    def __init__(self, name, path):
        self.path = path
        self.name = name
        self.test_documents = None
        self.test_answers = None
        self.train_documents = None
        self.train_answers = None
        self.validation_documents = None
        self.validation_answers = None
        logging.debug("Initialized dataset %s with folder path %s" %
                      (self.name, self.path))

    def __str__(self):
        return 'Dataset %s located at path %s' % (self.name, self.path)
    def _load_test_documents(self):
        """
        Loads the test documents.
        :return: a list of documents.
        """
        raise NotImplementedError

    def _load_test_answers(self):
        """
        Loads the answers for the test documents.
        :return: a list of answers.
        """
        raise NotImplementedError

    def _load_train_documents(self):
        """
        Loads the train documents.
        :return: a list of documents.
        """
        raise NotImplementedError

    def _load_train_answers(self):
        """
        Loads the answers for the train documents.
        :return: a list of answers.
        """
        raise NotImplementedError

    def _load_validation_documents(self):
        """
        Loads the validation documents.
        :return: a list of documents.
        """
        raise NotImplementedError

    def _load_validation_answers(self):
        """
        Loads the answers for the validation documents.
        :return: a list of answers.
        """
        raise NotImplementedError
    def load_test(self):
        """
        Loads the test documents and their answers.
        :return: a tuple containing the test documents and the test answers.
        """
        if not self.test_documents:
            self.test_documents = self._load_test_documents()
        if not self.test_answers:
            self.test_answers = self._load_test_answers()
        assert (len(self.test_documents) == len(self.test_answers)), \
            "You have too few (or too many) test answers for your documents!"
        logging.debug("Loaded the test set of dataset %s" % self.name)
        return self.test_documents, self.test_answers
    def load_train(self):
        """
        Loads the training documents and their answers.
        :return: a tuple containing the training documents and the training answers.
        """
        if not self.train_documents:
            self.train_documents = self._load_train_documents()
        if not self.train_answers:
            self.train_answers = self._load_train_answers()
        assert (len(self.train_documents) == len(self.train_answers)), \
            "You have too few (or too many) train answers for your documents!"
        logging.debug("Loaded the training set of dataset %s" % self.name)
        return self.train_documents, self.train_answers
    def load_validation(self):
        """
        Loads the validation documents and their answers.
        :return: a tuple containing the validation documents and the validation answers.
        """
        if not self.validation_documents:
            self.validation_documents = self._load_validation_documents()
        if not self.validation_answers:
            self.validation_answers = self._load_validation_answers()
        assert (not self.validation_documents and not self.validation_answers) or \
            (len(self.validation_documents) == len(self.validation_answers)), \
            "You have too few (or too many) validation answers for your documents!"
        logging.debug("Loaded the validation set of dataset %s" % self.name)
        return self.validation_documents, self.validation_answers
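
# A minimal usage sketch of the lazy-loading contract above (the corpus
# path "data/energy_news" is a hypothetical example, not one shipped with
# this repository): the first load_train() call reads the split from disk
# and caches it on the instance; later calls return the cached
# dictionaries without touching the filesystem.
#
#     dataset = EnNews("data/energy_news")
#     docs, keys = dataset.load_train()  # reads from disk
#     docs, keys = dataset.load_train()  # served from the cache
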
class EnNews(Dataset):
    """
    The EnergyNews dataset, prepared in the manner of Annette Hulth's
    "Improved Automatic Keyword Extraction Given More Linguistic
    Knowledge".
    Note: to make the results obtained with this dataset comparable to
    the ones described in Hulth's paper, only the "uncontrolled" terms
    are used.
    Full text of Hulth's paper: http://www.aclweb.org/anthology/W03-1028
    """

    def __init__(self, path):
        super().__init__("EnergyNews", path)
    def __load_documents(self, folder):
        """
        Loads the documents in the .clr files contained in the specified
        folder and puts them in a dictionary indexed by document ID
        (i.e. the filename without the extension).
        :param folder: the folder containing the documents
        :return: a dictionary with the documents
        """
        # This dictionary will contain the documents
        documents = {}
        for doc in os.listdir(os.path.join(self.path, folder)):
            if doc.endswith(".clr"):
                with open(os.path.join(self.path, folder, doc), "r") as f:
                    content = f.read()
                documents[doc[:doc.find('.')]] = content
        return documents
    def __load_answers(self, folder):
        """
        Loads the answers contained in the .key files and puts them in a
        dictionary indexed by document ID (i.e. the document name without
        the extension).
        :param folder: the folder containing the answer files
        :return: a dictionary with the answers
        """
        # This dictionary will contain the answers
        answers = {}
        for doc in os.listdir(os.path.join(self.path, folder)):
            if doc.endswith(".key"):
                with open(os.path.join(self.path, folder, doc), "r") as f:
                    content = f.read()
                # Each .key file is assumed to hold space-separated answers.
                retrieved_answers = content.split(' ')
                doc_id = doc[:doc.find('.')]
                for answer in retrieved_answers:
                    # Strip leading/trailing whitespace (spaces, newlines)
                    answer = answer.strip()
                    if doc_id not in answers:
                        answers[doc_id] = [answer]
                    else:
                        answers[doc_id].append(answer)
        return answers
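
    # Expected on-disk layout, inferred from the two loaders above (the
    # <doc_id> names are illustrative; only the split folder names used
    # by the methods below are fixed by the code):
    #
    #     <path>/train/<doc_id>.clr   plain-text document body
    #     <path>/train/<doc_id>.key   space-separated gold keywords
    #     <path>/test/..., <path>/validation/...   same structure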
    def _load_test_documents(self):
        return self.__load_documents("test")

    def _load_train_documents(self):
        return self.__load_documents("train")

    def _load_validation_documents(self):
        return self.__load_documents("validation")

    def _load_test_answers(self):
        return self.__load_answers("test")

    def _load_train_answers(self):
        return self.__load_answers("train")

    def _load_validation_answers(self):
        return self.__load_answers("validation")
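

# A minimal smoke test, assuming the corpus lives under a hypothetical
# "data/energy_news" folder laid out as described in EnNews above.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    dataset = EnNews("data/energy_news")
    print(dataset)
    train_documents, train_answers = dataset.load_train()
    print("Loaded %d training documents" % len(train_documents))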