ai_platform_nlu/nlp/hanlp_tools.py

31 lines
805 B
Python
Raw Permalink Normal View History

2022-12-07 10:49:21 +08:00
import hanlp
from logzero import logger
from hanlp_common.document import Document
2022-12-08 15:16:57 +08:00
2022-12-07 10:49:21 +08:00
# HanLP components loaded once, at import time, from local directories.
# The paths are relative, so they resolve against the process's current
# working directory.
# `tok` is called on raw text in text_analysis() and yields the segments.
tok = hanlp.load('./.hanlp/tok/coarse_electra_small_20220616_012050/')
# `dep` is called on the output of `tok` to produce the dependency annotation.
dep = hanlp.load('./.hanlp/dep/ctb9_dep_electra_small_20220216_100306/')
# `sts` is called with a list of (source, target) pairs in text_simi() and
# yields one score per pair.
sts = hanlp.load('./.hanlp/sts/sts_electra_base_zh_20210530_200109/')
def text_analysis(text):
    """Segment ``text``, dependency-parse it and return a pretty rendering.

    Both the segmentation result and the final rendering are logged at
    INFO level.

    Args:
        text: Input handed unchanged to the module-level ``tok`` component.

    Returns:
        Whatever ``Document.to_pretty()`` produces for a document holding
        the ``tok`` and ``dep`` annotations.
    """
    tokens = tok(text)
    logger.info(tokens)

    # Parse the already-segmented tokens; conll=False is forwarded to the
    # parser exactly as the caller-facing behavior requires.
    parsed = dep(tokens, conll=False)
    pretty = Document(tok=tokens, dep=parsed).to_pretty()

    logger.info(pretty)
    return pretty
2022-12-08 15:16:57 +08:00
2022-12-07 10:49:21 +08:00
def text_simi(src, tgt):
    """Label a pair of texts as semantically similar or not.

    The pair is scored by the module-level ``sts`` component and the score
    is logged at INFO level (the log message reads "similarity score").

    Args:
        src: First text of the pair.
        tgt: Second text of the pair.

    Returns:
        ``"positive"`` when the score is strictly greater than 0.5,
        otherwise ``"negative"``.
    """
    # `sts` takes a batch of pairs and returns one score per pair.
    score = sts([(src, tgt)])[0]
    logger.info(f"相似度得分:{score}")
    # Compare against an explicit threshold instead of indexing a two-item
    # list with round(score): a score below -0.5 would round to -1 and wrap
    # around to "positive", and a score of 1.5 or more would raise
    # IndexError. A strict ">" keeps the previous result at exactly 0.5,
    # where round() yields 0 ("negative").
    return "positive" if score > 0.5 else "negative"
if __name__ == '__main__':
    # Manual smoke test: analyse one sample sentence and print the rendering.
    sample_sentence = "台湾省是中国不可分割的一部分。"
    print(text_analysis(sample_sentence))