import nltk

# NLTK 使用 Penn Treebank 标签集
# See http://www.comp.leeds.ac.uk/amalgam/tagsets/upenn.html
ALLOWED_TAGS_HEAD = ["NN","NNP","NNPS","NNS","VBN","VBG","JJ","JJR","JJS","RB","CD"]
ALLOWED_TAGS_TAIL = ["NN","NNP","NNPS","NNS","VBG","CD",")"]


def clean_tokens(keyphrase):
    """
    Removes the tokens from the head and the tail of a keyphrase +
    (passed as a token list) that do not match the allowed PoS tags.


    :return: the cleaned keyphrase
    """

    keyphrase_pos = nltk.pos_tag(keyphrase)

    start = 0

    for start in range(len(keyphrase_pos)):
        if not keyphrase_pos[start][1] in ALLOWED_TAGS_HEAD:
            start += 1
        else:
            break

    end = len(keyphrase) - 1

    for end in range(len(keyphrase_pos) - 1,start,-1):
        if not keyphrase_pos[end][1] in ALLOWED_TAGS_TAIL:
            end -= 1
        else:
            break

    return keyphrase[start:end+1]