import os
import json
from flask import Flask, request, make_response
from logzero import logger
from functools import lru_cache
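
# utils is expected to provide translate, preprocess_zh and get_noun_zh used below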
from utils import *
from local_encoder import EmbeddingModel
# current_path = os.path.dirname(os.path.abspath(__file__)) # for local
current_path = os.getcwd() # for docker
logger.info(f"{current_path}")
app = Flask(__name__)
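
# Make CUDA device numbering follow the physical PCI bus order and
# expose only GPUs 0 and 1 to this process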
os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
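
# Local checkpoint of the bge-large-zh-v1.5 embedding model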
path = "/home/zhangxj/models/bge-large-zh-v1.5"
model = EmbeddingModel(path)
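
# Cache embeddings for repeated inputs; lru_cache requires the argument to be
# hashable, so `sentence` should be a single string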
@lru_cache()
def process_and_embed(sentence):
    # Convert the input to Chinese so everything downstream is handled as Chinese text
    sentence = translate(sentence)

    # Chinese preprocessing: clean the text, then extract the nouns
    clean_text = preprocess_zh(sentence)
    processed_text = get_noun_zh(clean_text)

    # If preprocessing left nothing, fall back to the original text
    if not processed_text.strip():
        processed_text = sentence

    # Encode the processed text into an embedding vector
    embeddings = model.get_embeddings(processed_text)
    return embeddings
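
# POST /embedding/ expects a JSON body like {"sentences": "..."} and returns
# the embedding under "data" with code 200 (406 if the input is missing or empty)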
@app.route('/embedding/', methods=["POST"])
def run_cls():
    resp_info = dict()
    if request.method == "POST":
        sentences = request.json.get('sentences')
        if sentences is not None and len(sentences) != 0:
            logger.info(sentences)
            resp_info["code"] = 200
            resp_info["data"] = process_and_embed(sentences)
        else:
            resp_info["msg"] = "Input is None, please check!"
            resp_info["code"] = 406
    resp = make_response(json.dumps(resp_info))
    resp.status_code = 200
    return resp

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5163, debug=False)
    # res = process_and_embed("土豆")
    # print(res)
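
# Example client call (a sketch; assumes the service is running locally on port 5163):
#   import requests
#   resp = requests.post("http://localhost:5163/embedding/", json={"sentences": "土豆"})
#   print(resp.json())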