import logging
import os

from nlp import tokenizer as tk


class Dataset(object):
    """
    An abstract class that represents a dataset.

    Subclasses implement the six ``_load_*`` hooks; the public
    ``load_test`` / ``load_train`` / ``load_validation`` methods lazily
    call them and cache the results on the instance.
    """

    def __init__(self, name, path):
        """
        :param name: human-readable name of the dataset.
        :param path: folder containing the dataset files.
        """
        self.path = path
        self.name = name
        # Lazily-populated caches; None means "not loaded yet".
        self.test_documents = None
        self.test_answers = None
        self.train_documents = None
        self.train_answers = None
        self.validation_documents = None
        self.validation_answers = None
        # Lazy %-args: the message is only formatted if DEBUG is enabled.
        logging.debug("初始化数据集 %s 文件夹路径 %s", self.name, self.path)

    def __str__(self):
        return '数据集 %s 所在路径 %s' % (self.name, self.path)

    def _load_test_documents(self):
        """
        Loads the test documents.

        :return: a list of documents.
        """
        raise NotImplementedError

    def _load_test_answers(self):
        """
        Loads the answers for the test documents.

        :return: a list of answers.
        """
        raise NotImplementedError

    def _load_train_documents(self):
        """
        Loads the train documents.

        :return: a list of documents.
        """
        raise NotImplementedError

    def _load_train_answers(self):
        """
        Loads the answers for the train documents.

        :return: a list of answers.
        """
        raise NotImplementedError

    def _load_validation_documents(self):
        """
        Loads the validation documents.

        :return: a list of documents.
        """
        raise NotImplementedError

    def _load_validation_answers(self):
        """
        Loads the answers for the validation documents.

        :return: a list of answers.
        """
        raise NotImplementedError

    def load_test(self):
        """
        Loads the test documents and their answers, caching them.

        :return: a tuple containing the test documents and the test answers.
        """
        # "is None" (not truthiness) so a legitimately empty split is
        # not re-loaded on every call.
        if self.test_documents is None:
            self.test_documents = self._load_test_documents()
        if self.test_answers is None:
            self.test_answers = self._load_test_answers()
        assert len(self.test_documents) == len(self.test_answers), \
            "You have not enough (or too many) test answers for your documents!"
        logging.debug("为数据集加载测试集 %s", self.name)
        return self.test_documents, self.test_answers

    def load_train(self):
        """
        Loads the training documents and their answers, caching them.

        :return: a tuple containing the train documents and the training
            answers.
        """
        if self.train_documents is None:
            self.train_documents = self._load_train_documents()
        if self.train_answers is None:
            self.train_answers = self._load_train_answers()
        assert len(self.train_documents) == len(self.train_answers), \
            "You have not enough (or too many) train answers for your documents!"
        logging.debug("为数据集加载训练集 %s", self.name)
        return self.train_documents, self.train_answers

    def load_validation(self):
        """
        Loads the validation documents and their answers, caching them.

        :return: a tuple containing the validation documents and the
            validation answers.
        """
        if self.validation_documents is None:
            self.validation_documents = self._load_validation_documents()
        if self.validation_answers is None:
            self.validation_answers = self._load_validation_answers()
        # Either the split is entirely absent (both empty/None) or the
        # counts must match.  The original checked `validation_answers`
        # twice and never looked at `validation_documents` (copy-paste bug).
        assert (not self.validation_documents and not self.validation_answers) or \
            (len(self.validation_documents) == len(self.validation_answers)), \
            "You have not enough (or too many) validation answers for your documents!"
        logging.debug("为数据集加载验证集 %s", self.name)
        return self.validation_documents, self.validation_answers


class EnNews(Dataset):
    """
    Dataset from Annette Hulth's "Improved Automatic Keyword Extraction
    Given More Linguistic Knowledge".

    Note: to make the results obtained with this dataset comparable to the
    ones described in Hulth's paper, only the "uncontrolled" terms are used.

    Full-text here: http://www.aclweb.org/anthology/W03-1028
    """

    def __init__(self, path):
        super().__init__("EnergyNews", path)

    def __load_documents(self, folder):
        """
        Loads the documents in the .clr files contained in the specified
        folder and puts them in a dictionary indexed by document id
        (i.e. the filename up to its first '.').

        :param folder: the folder containing the documents.
        :return: a dictionary with the documents.
        """
        documents = {}
        folder_path = os.path.join(self.path, folder)
        for doc in os.listdir(folder_path):
            if doc.endswith(".clr"):
                # 'with' guarantees the handle is closed; the original
                # leaked open file objects.
                with open(os.path.join(folder_path, doc), "r") as f:
                    content = f.read()
                documents[doc[:doc.find('.')]] = content
        return documents

    def __load_answers(self, folder):
        """
        Loads the answers contained in the .key files and puts them in a
        dictionary indexed by document ID (i.e. the filename up to its
        first '.').

        :param folder: the folder containing the answer files.
        :return: a dictionary with the answers.
        """
        answers = {}
        folder_path = os.path.join(self.path, folder)
        for doc in os.listdir(folder_path):
            if doc.endswith(".key"):
                with open(os.path.join(folder_path, doc), "r") as f:
                    content = f.read()
                retrieved_answers = content.split(' ')
                doc_id = doc[:doc.find('.')]
                for answer in retrieved_answers:
                    # Strip leading/trailing whitespace (spaces, newlines).
                    answer = answer.strip()
                    answers.setdefault(doc_id, []).append(answer)
        return answers

    def _load_test_documents(self):
        return self.__load_documents("test")

    def _load_train_documents(self):
        return self.__load_documents("train")

    def _load_validation_documents(self):
        return self.__load_documents("validation")

    def _load_test_answers(self):
        return self.__load_answers("test")

    def _load_train_answers(self):
        return self.__load_answers("train")

    def _load_validation_answers(self):
        return self.__load_answers("validation")