EnergyNewsKeyword/nlp/cleaner.py

39 lines
934 B
Python
Raw Normal View History

2023-04-19 13:16:27 +08:00
import nltk
# NLTK uses the Penn Treebank tag set.
# See http://www.comp.leeds.ac.uk/amalgam/tagsets/upenn.html

# PoS tags a token must carry to be accepted as the first token of a
# keyphrase (checked by clean_tokens when trimming from the left).
ALLOWED_TAGS_HEAD = [
    "NN",    # noun, singular or mass
    "NNP",   # proper noun, singular
    "NNPS",  # proper noun, plural
    "NNS",   # noun, plural
    "VBN",   # verb, past participle
    "VBG",   # verb, gerund / present participle
    "JJ",    # adjective
    "JJR",   # adjective, comparative
    "JJS",   # adjective, superlative
    "RB",    # adverb
    "CD",    # cardinal number
]

# PoS tags a token must carry to be accepted as the last token of a
# keyphrase (checked by clean_tokens when trimming from the right).
ALLOWED_TAGS_TAIL = [
    "NN",    # noun, singular or mass
    "NNP",   # proper noun, singular
    "NNPS",  # proper noun, plural
    "NNS",   # noun, plural
    "VBG",   # verb, gerund / present participle
    "CD",    # cardinal number
    ")",     # closing parenthesis
]
def clean_tokens(keyphrase):
    """
    Trim a keyphrase so that it starts and ends on an allowed PoS tag.

    The keyphrase (a list of tokens) is tagged with ``nltk.pos_tag``.
    Leading tokens are dropped until one carries a tag listed in
    ALLOWED_TAGS_HEAD; trailing tokens are then dropped until one carries
    a tag listed in ALLOWED_TAGS_TAIL.

    The trailing scan never goes back as far as the head token, so once a
    valid head is found the result always contains at least that token,
    even if its own tag is not an allowed tail tag. If no token has an
    allowed head tag, the result is empty.

    :param keyphrase: the keyphrase as a list of tokens
    :return: the cleaned keyphrase, as a slice of the input
    """
    tagged = nltk.pos_tag(keyphrase)
    total = len(tagged)

    # Index of the first token allowed to open the keyphrase; falls back
    # to `total` (one past the end) when no token qualifies.
    first = next(
        (idx for idx in range(total) if tagged[idx][1] in ALLOWED_TAGS_HEAD),
        total,
    )

    # Walk backwards from the final token down to (but excluding) `first`
    # looking for a token allowed to close the keyphrase. When none is
    # found the keyphrase collapses to the head token alone.
    last = first
    for idx in range(total - 1, first, -1):
        if tagged[idx][1] in ALLOWED_TAGS_TAIL:
            last = idx
            break

    return keyphrase[first:last + 1]