EnergyNewsKeyword/nlp/cleaner.py

import nltk

# NLTK tags tokens with the Penn Treebank tag set.
# See http://www.comp.leeds.ac.uk/amalgam/tagsets/upenn.html

# Tags accepted on the first token of a keyphrase.
ALLOWED_TAGS_HEAD = [
    "NN", "NNP", "NNPS", "NNS",  # nouns (common / proper, singular / plural)
    "VBN", "VBG",                # past participle, gerund / present participle
    "JJ", "JJR", "JJS",          # adjectives (base, comparative, superlative)
    "RB",                        # adverb
    "CD",                        # cardinal number
]

# Tags accepted on the last token of a keyphrase.
ALLOWED_TAGS_TAIL = [
    "NN", "NNP", "NNPS", "NNS",  # nouns (common / proper, singular / plural)
    "VBG",                       # gerund / present participle
    "CD",                        # cardinal number
    ")",                         # closing parenthesis
]


def clean_tokens(keyphrase):
    """
    Trim a keyphrase so that it starts and ends on tokens with allowed PoS tags.

    The tokens are tagged with ``nltk.pos_tag``. Leading tokens are dropped
    until one carries a tag listed in ``ALLOWED_TAGS_HEAD``; trailing tokens
    are then dropped until one carries a tag listed in ``ALLOWED_TAGS_TAIL``.

    The tail search never moves past the head token, and the head token itself
    is not checked against ``ALLOWED_TAGS_TAIL``: when no later token is an
    allowed tail, the result is the head token alone.

    :param keyphrase: the keyphrase as a list of tokens
    :return: the cleaned keyphrase (a slice of ``keyphrase``); empty when no
             token carries an allowed head tag
    """

    tags = [tag for _, tag in nltk.pos_tag(keyphrase)]
    count = len(tags)

    # Index of the first token allowed to open the keyphrase. Falling back to
    # `count` makes the final slice empty when no token qualifies.
    start = next(
        (i for i, tag in enumerate(tags) if tag in ALLOWED_TAGS_HEAD),
        count,
    )

    # Index of the last token, strictly after `start`, allowed to close the
    # keyphrase. Falling back to `start` keeps the head token on its own.
    end = next(
        (i for i in range(count - 1, start, -1) if tags[i] in ALLOWED_TAGS_TAIL),
        start,
    )

    return keyphrase[start:end + 1]
first commit 2023-04-19 13:16:27 +08:00			`import nltk`

			`# NLTK 使用 Penn Treebank 标签集`
			`# See http://www.comp.leeds.ac.uk/amalgam/tagsets/upenn.html`
			`ALLOWED_TAGS_HEAD = ["NN","NNP","NNPS","NNS","VBN","VBG","JJ","JJR","JJS","RB","CD"]`
			`ALLOWED_TAGS_TAIL = ["NN","NNP","NNPS","NNS","VBG","CD",")"]`


			`def clean_tokens(keyphrase):`
			`"""`
			`Removes the tokens from the head and the tail of a keyphrase +`
			`(passed as a token list) that do not match the allowed PoS tags.`


			`:return: the cleaned keyphrase`
			`"""`

			`keyphrase_pos = nltk.pos_tag(keyphrase)`

			`start = 0`

			`for start in range(len(keyphrase_pos)):`
			`if not keyphrase_pos[start][1] in ALLOWED_TAGS_HEAD:`
			`start += 1`
			`else:`
			`break`

			`end = len(keyphrase) - 1`

			`for end in range(len(keyphrase_pos) - 1,start,-1):`
			`if not keyphrase_pos[end][1] in ALLOWED_TAGS_TAIL:`
			`end -= 1`
			`else:`
			`break`

			`return keyphrase[start:end+1]`