39 lines
934 B
Python
39 lines
934 B
Python
|
import nltk
|
||
|
|
||
|
# NLTK uses the Penn Treebank tag set
|
||
|
# See http://www.comp.leeds.ac.uk/amalgam/tagsets/upenn.html
|
||
|
# PoS tags (Penn Treebank names) that the first token of a cleaned keyphrase
# may carry.
ALLOWED_TAGS_HEAD = [
    "NN", "NNP", "NNPS", "NNS",  # nouns (common / proper, singular / plural)
    "VBN", "VBG",                # past participle, gerund / present participle
    "JJ", "JJR", "JJS",          # adjectives (base, comparative, superlative)
    "RB",                        # adverb
    "CD",                        # cardinal number
]
|
||
|
# PoS tags (Penn Treebank names) that the last token of a cleaned keyphrase
# may carry.
ALLOWED_TAGS_TAIL = [
    "NN", "NNP", "NNPS", "NNS",  # nouns (common / proper, singular / plural)
    "VBG",                       # gerund / present participle
    "CD",                        # cardinal number
    ")",                         # closing parenthesis
]
|
||
|
|
||
|
|
||
|
def clean_tokens(keyphrase):
    """
    Trim a keyphrase so it begins and ends on tokens with allowed PoS tags.

    The keyphrase (passed as a token list) is tagged with ``nltk.pos_tag``.
    Leading tokens are dropped until one whose tag is in ALLOWED_TAGS_HEAD
    is reached; trailing tokens are then dropped until one whose tag is in
    ALLOWED_TAGS_TAIL is reached. The tail trim never goes past the first
    kept token, so that token is retained even if its tag is not an
    allowed tail tag.

    :param keyphrase: the keyphrase as a list of tokens
    :return: the cleaned keyphrase, as a slice of the input; empty when no
             token carries an allowed head tag
    """
    tags = [tagged[1] for tagged in nltk.pos_tag(keyphrase)]
    count = len(tags)

    # Position of the first token allowed to open the keyphrase, or `count`
    # (one past the end) when there is none.
    first = next(
        (i for i, tag in enumerate(tags) if tag in ALLOWED_TAGS_HEAD),
        count,
    )

    # Walk backwards from the final token, stopping at the first allowed
    # tail tag or when the head position is reached, whichever comes first.
    last = count - 1
    while last > first and tags[last] not in ALLOWED_TAGS_TAIL:
        last -= 1

    return keyphrase[first:last + 1]
|
||
|
|
||
|
|