first commit
This commit is contained in: commit 8c33ecc8ca
@@ -0,0 +1,10 @@
FROM python:3.7.13-slim

WORKDIR /app
ADD ./requirements.txt /app/

RUN pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple --no-cache-dir
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple --no-cache-dir

ADD . /app/
CMD ["python3", "run.py"]
@@ -0,0 +1,4 @@
import os
os.environ['TF_KERAS'] = '1'  # use tf.keras as the bert4keras backend
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'  # map CUDA device ids to the physical bus order
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # hide all GPUs: run on CPU only
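Note that these assignments only take effect if they run before bert4keras is first imported, because bert4keras.backend reads TF_KERAS at import time. A minimal sketch of the required ordering (the same pattern the modules below follow):

import os
os.environ['TF_KERAS'] = '1'  # must be set before the first bert4keras import
from bert4keras.backend import keras, K  # now resolves to tf.keras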
@@ -0,0 +1,63 @@
#! -*- coding: utf-8 -*-
# Chit-chat dialogue with a NEZHA model
# Test script
# Test environment: tensorflow 2.5.3 + keras 2.3.1 + bert4keras 0.11
import os
os.environ['TF_KERAS'] = "1"

import numpy as np
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import AutoRegressiveDecoder


class ChatBot(AutoRegressiveDecoder):
    """Chatbot based on random sampling.
    """
    def __init__(self, start_id, end_id, maxlen, model, tokenizer):
        super().__init__(start_id, end_id, maxlen)
        self.model = model
        self.tokenizer = tokenizer

    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        # Segment ids alternate per speaker turn, so the reply being generated
        # gets the segment opposite to the last input segment.
        curr_segment_ids = np.ones_like(output_ids) - segment_ids[0, -1]
        segment_ids = np.concatenate([segment_ids, curr_segment_ids], 1)
        return self.model.predict([token_ids, segment_ids])[:, -1]

    def response(self, texts, topk=5):
        token_ids, segment_ids = [self.tokenizer._token_start_id], [0]
        for i, text in enumerate(texts):
            ids = self.tokenizer.encode(text)[0][1:]
            token_ids.extend(ids)
            segment_ids.extend([i % 2] * len(ids))
        results = self.random_sample([token_ids, segment_ids], 1, topk)
        return self.tokenizer.decode(results[0])


def build_chat_model(model_path, tokenizer):
    # NEZHA configuration
    config_path = f'{model_path}config.json'
    checkpoint_path = f'{model_path}model.ckpt'

    # Build and load the model
    model = build_transformer_model(
        config_path,
        checkpoint_path,
        model='nezha',
        application='lm',
    )
    chatbot = ChatBot(start_id=None, end_id=tokenizer._token_end_id, maxlen=32, model=model, tokenizer=tokenizer)
    return chatbot


if __name__ == '__main__':
    tokenizer = Tokenizer("../models/nezha_gpt_dialog/vocab.txt", do_lower_case=True)
    # Note the trailing slash: build_chat_model joins paths by plain concatenation.
    chatbot = build_chat_model('../models/nezha_gpt_dialog/', tokenizer)
    text_list = ['绿遍山原白满川', '子规声里雨如烟']
    print(chatbot.response(text_list))
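For reference, a minimal pure-Python sketch of the input layout that ChatBot.response() builds; the ids below are invented for illustration, and no model or tokenizer is loaded:

# Hypothetical ids: [CLS]=2, [SEP]=3; two dialogue turns, each with [CLS] already stripped.
fake_encoded = [[10, 11, 3], [12, 13, 14, 3]]
token_ids, segment_ids = [2], [0]
for i, ids in enumerate(fake_encoded):
    token_ids.extend(ids)
    segment_ids.extend([i % 2] * len(ids))
print(token_ids)    # [2, 10, 11, 3, 12, 13, 14, 3]
print(segment_ids)  # [0, 0, 0, 0, 1, 1, 1, 1] -- segments alternate per speaker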
@@ -0,0 +1,28 @@
import hanlp
from logzero import logger
from hanlp_common.document import Document
tok = hanlp.load('./.hanlp/tok/coarse_electra_small_20220616_012050/')
dep = hanlp.load('./.hanlp/dep/ctb9_dep_electra_small_20220216_100306/')
sts = hanlp.load('./.hanlp/sts/sts_electra_base_zh_20210530_200109/')


def text_analysis(text):
    """Tokenize the text and pretty-print its dependency parse."""
    segments = tok(text)
    logger.info(segments)
    doc = Document(
        tok=segments,
        dep=dep(segments, conll=False),
    )
    rst = doc.to_pretty()
    logger.info(rst)
    return rst

def text_simi(src, tgt):
    """Binary semantic-similarity decision for a sentence pair."""
    score = sts([(src, tgt)])[0]
    result = ["negative", "positive"][round(score)]
    return result


if __name__ == '__main__':
    print(text_analysis("台湾省是中国不可分割的一部分。"))
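A minimal usage sketch for text_simi (this assumes the three local .hanlp model directories above are present; the STS model returns a similarity score that round() maps onto the two labels):

print(text_simi("今天天气不错", "今天天气很好"))  # expected "positive" for near-paraphrases
print(text_simi("今天天气不错", "我想吃火锅"))    # expected "negative" for unrelated sentences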
@@ -0,0 +1,228 @@
#! -*- coding: utf-8 -*-
# Reaches a score of about 0.77 on valid after 10 epochs
# (Accuracy=0.7282149325820084 F1=0.8207266829447049 Final=0.7744708077633566)

import json, os, re
os.environ['TF_KERAS'] = '1'
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda
from keras.models import Model
from tqdm import tqdm

import tensorflow as tf
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True  # allocate GPU memory on demand
tf_session = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=config)
tf.compat.v1.keras.backend.set_session(tf_session)

max_p_len = 256
max_q_len = 64
max_a_len = 32
batch_size = 32
epochs = 10

# # BERT config
# config_path = '../models/nezha_gpt/config.json'
# checkpoint_path = '../models/nezha_gpt/gpt.ckpt'
# dict_path = '../models/tokenizer/vocab.txt'

# # Labeled data
# webqa_data = json.load(open('../data/qa/WebQA.json'))
# sogou_data = json.load(open('../data/qa/SogouQA.json'))

# # Save a random order (used to split off valid)
# if not os.path.exists('../random_order.json'):
#     random_order = list(range(len(sogou_data)))
#     np.random.shuffle(random_order)
#     json.dump(random_order, open('../random_order.json', 'w'), indent=4)
# else:
#     random_order = json.load(open('../random_order.json'))

# # Split off valid
# train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0]
# valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0]
# train_data.extend(train_data)
# train_data.extend(webqa_data)  # mix SogouQA and WebQA at a 2:1 ratio

# # Load and simplify the vocab, build the tokenizer
# token_dict, keep_tokens = load_vocab(
#     dict_path=dict_path,
#     simplified=True,
#     startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
# )
# tokenizer = Tokenizer(token_dict, do_lower_case=True)


# class data_generator(DataGenerator):
#     """Data generator.
#     """
#     def __iter__(self, random=False):
#         """Format of a single sample:
#         input:  [CLS][MASK][MASK][SEP]question[SEP]passage[SEP]
#         output: answer
#         """
#         batch_token_ids, batch_segment_ids, batch_a_token_ids = [], [], []
#         for is_end, D in self.sample(random):
#             question = D['question']
#             answers = [p['answer'] for p in D['passages'] if p['answer']]
#             passage = np.random.choice(D['passages'])['passage']
#             passage = re.sub(u' |、|;|,', ',', passage)
#             final_answer = ''
#             for answer in answers:
#                 if all([
#                     a in passage[:max_p_len - 2] for a in answer.split(' ')
#                 ]):
#                     final_answer = answer.replace(' ', ',')
#                     break
#             a_token_ids, _ = tokenizer.encode(
#                 final_answer, maxlen=max_a_len + 1
#             )
#             q_token_ids, _ = tokenizer.encode(question, maxlen=max_q_len + 1)
#             p_token_ids, _ = tokenizer.encode(passage, maxlen=max_p_len + 1)
#             token_ids = [tokenizer._token_start_id]
#             token_ids += ([tokenizer._token_mask_id] * max_a_len)
#             token_ids += [tokenizer._token_end_id]
#             token_ids += (q_token_ids[1:] + p_token_ids[1:])
#             segment_ids = [0] * len(token_ids)
#             batch_token_ids.append(token_ids)
#             batch_segment_ids.append(segment_ids)
#             batch_a_token_ids.append(a_token_ids[1:])
#             if len(batch_token_ids) == self.batch_size or is_end:
#                 batch_token_ids = sequence_padding(batch_token_ids)
#                 batch_segment_ids = sequence_padding(batch_segment_ids)
#                 batch_a_token_ids = sequence_padding(
#                     batch_a_token_ids, max_a_len
#                 )
#                 yield [batch_token_ids, batch_segment_ids], batch_a_token_ids
#                 batch_token_ids, batch_segment_ids, batch_a_token_ids = [], [], []


def masked_cross_entropy(y_true, y_pred):
    """Cross-entropy loss that masks out predictions on padding.
    """
    y_true = K.reshape(y_true, [K.shape(y_true)[0], -1])
    y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
    cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
    cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)
    return cross_entropy


def build_reading_model(config_path: str, ckpt_path: str, keep_tokens: list, weight_path: str):
    model = build_transformer_model(
        config_path,
        ckpt_path,
        with_mlm=True,
        keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, simplifying the original vocab
    )
    output = Lambda(lambda x: x[:, 1:max_a_len + 1])(model.output)
    model = Model(model.input, output)
    model.compile(loss=masked_cross_entropy, optimizer=Adam(1e-5))
    model.load_weights(weight_path)
    return model


def get_ngram_set(x, n):
    """Build the n-gram table; the result format is:
    {(n-1)-gram: set of possible n-th tokens}
    """
    result = {}
    for i in range(len(x) - n + 1):
        k = tuple(x[i:i + n])
        if k[:-1] not in result:
            result[k[:-1]] = set()
        result[k[:-1]].add(k[-1])
    return result


def gen_answer(question, passages, model, tokenizer):
    """Since this is an MLM model, we can decode directly with argmax.
    """
    all_p_token_ids, token_ids, segment_ids = [], [], []
    for passage in passages:
        passage = re.sub(u' |、|;|,', ',', passage)
        p_token_ids, _ = tokenizer.encode(passage, maxlen=max_p_len + 1)
        q_token_ids, _ = tokenizer.encode(question, maxlen=max_q_len + 1)
        all_p_token_ids.append(p_token_ids[1:])
        token_ids.append([tokenizer._token_start_id])
        token_ids[-1] += ([tokenizer._token_mask_id] * max_a_len)
        token_ids[-1] += [tokenizer._token_end_id]
        token_ids[-1] += (q_token_ids[1:] + p_token_ids[1:])
        segment_ids.append([0] * len(token_ids[-1]))
    token_ids = sequence_padding(token_ids)
    segment_ids = sequence_padding(segment_ids)
    probas = model.predict([token_ids, segment_ids])
    results = {}
    for t, p in zip(all_p_token_ids, probas):
        a, score = tuple(), 0.
        for i in range(max_a_len):
            idxs = list(get_ngram_set(t, i + 1)[a])
            if tokenizer._token_end_id not in idxs:
                idxs.append(tokenizer._token_end_id)
            # pi zeroes out the probability of tokens that do not appear in the passage
            pi = np.zeros_like(p[i])
            pi[idxs] = p[i, idxs]
            a = a + (pi.argmax(),)
            score += pi.max()
            if a[-1] == tokenizer._token_end_id:
                break
        score = score / (i + 1)
        a = tokenizer.decode(a)
        if a:
            results[a] = results.get(a, []) + [score]
    results = {
        k: (np.array(v)**2).sum() / (sum(v) + 1)
        for k, v in results.items()
    }
    return results


def max_in_dict(d):
    if d:
        return sorted(d.items(), key=lambda s: -s[1])[0][0]


# def predict_to_file(data, filename):
#     """Write the predictions to a file for easy evaluation.
#     """
#     with open(filename, 'w', encoding='utf-8') as f:
#         for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)):
#             q_text = d['question']
#             p_texts = [p['passage'] for p in d['passages']]
#             a = gen_answer(q_text, p_texts)
#             a = max_in_dict(a)
#             if a:
#                 s = u'%s\t%s\n' % (d['id'], a)
#             else:
#                 s = u'%s\t\n' % (d['id'])
#             f.write(s)
#             f.flush()


# class Evaluator(keras.callbacks.Callback):
#     """Evaluation and checkpointing.
#     """
#     def __init__(self):
#         self.lowest = 1e10

#     def on_epoch_end(self, epoch, logs=None):
#         # save the best model
#         if logs['loss'] <= self.lowest:
#             self.lowest = logs['loss']
#             model.save_weights('../models/qa/best_model.weights')


if __name__ == '__main__':
    # Paths follow run.py and the commented config above; build_reading_model
    # requires them explicitly and already loads the fine-tuned weights.
    token_dict, keep_tokens = load_vocab(
        dict_path='../models/tokenizer/vocab.txt',
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
    )
    tokenizer = Tokenizer(token_dict, do_lower_case=True)
    model = build_reading_model(
        '../models/nezha_gpt/config.json',
        '../models/nezha_gpt/gpt.ckpt',
        keep_tokens,
        '../models/qa/best_model.weights',
    )
    questions = "嬴政出生在哪里?"
    passages = ["秦始皇嬴政(前259年—前210年),嬴姓,赵氏 ,名政(一说名“正”),又称赵政 、祖龙 ,也有吕政一说(详见“人物争议-姓名之争”目录)。秦庄襄王和赵姬之子。中国古代杰出的政治家、战略家、改革家,首次完成中国大一统的政治人物,也是中国第一个称皇帝的君主。",
                "公元前221年,秦统一六国之后,秦王嬴政认为自己“德兼三皇,功过五帝”,遂采用三皇之“皇”、五帝之“帝”构成“皇帝”的称号,是中国历史上第一个使用“皇帝”称号的君主,所以自称“始皇帝”。",
                "秦始皇有二十余子。长子扶苏,少子胡亥。",
                "嬴政出生在当时赵国的邯郸廓城(在今城内中街以东,丛台西南的朱家巷一带),是当时的秦国王孙异人之子。"]
    print(gen_answer(questions, passages, model, tokenizer))
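A quick pure-Python illustration of get_ngram_set, which is what lets gen_answer restrict each decoding step to tokens that actually follow the current prefix somewhere in the passage:

# Toy token-id sequence; no model required.
print(get_ngram_set([5, 6, 7, 5, 6, 8], 2))
# {(5,): {6}, (6,): {7, 8}, (7,): {5}}
print(get_ngram_set([5, 6, 7, 5, 6, 8], 1))
# {(): {5, 6, 7, 8}}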
@@ -0,0 +1,200 @@
#! -*- coding:utf-8 -*-
import os

# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # (make CUDA ids match the physical bus order)
# os.environ['CUDA_VISIBLE_DEVICES'] = "0, 1"  # (use only GPUs 0 and 1)
os.environ['TF_KERAS'] = '1'

import numpy as np
import pandas as pd
import tensorflow as tf
from logzero import logger
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True  # allocate GPU memory on demand
tf_session = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=config)
tf.compat.v1.keras.backend.set_session(tf_session)


from sklearn.model_selection import train_test_split


set_gelu('tanh')  # switch the gelu variant

MAX_LEN = 128
BATCH_SIZE = 32
AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')  # Adam with a piecewise-linear lr schedule

# config_path = '/home/zhaojh/pretrain_models/roberta_base/bert_config.json'
# checkpoint_path = '/home/zhaojh/pretrain_models/roberta_base/bert_model.ckpt'
# dict_path = '/home/zhaojh/pretrain_models/roberta_base/vocab.txt'
# Build the tokenizer
# tokenizer = Tokenizer(dict_path, do_lower_case=True)

# def label_to_digit(data_df: pd.DataFrame, labels):
#     data = list()
#     for i in range(data_df.shape[0]):
#         data.append((data_df.iloc[i]['text'], int(labels.index(data_df.iloc[i]['label']))))
#     return data

# def load_data(all_data: pd.DataFrame):
#     """Load the data.
#     Single-sample format: (text, label id)
#     """
#     logger.info(f"数据集大小:{all_data.shape}")
#     assert ('text' in all_data.columns and 'label' in all_data.columns)
#     use_data = all_data[['text', 'label']].copy()
#     labels = use_data.label.unique().tolist()
#     train, valid = train_test_split(use_data, test_size=0.3, random_state=42, shuffle=True)
#     valid, test = train_test_split(valid, test_size=0.7, random_state=42, shuffle=True)
#     # Convert the datasets
#     train_data = label_to_digit(train, labels)
#     valid_data = label_to_digit(valid, labels)
#     test_data = label_to_digit(test, labels)
#     train_generator = MyDataGenerator(train_data, BATCH_SIZE)
#     valid_generator = MyDataGenerator(valid_data, BATCH_SIZE)
#     test_generator = MyDataGenerator(test_data, BATCH_SIZE)
#     return train_generator, valid_generator, test_generator, labels


# class MyDataGenerator(DataGenerator):
#     """Data generator.
#     """
#     def __iter__(self, random=False):
#         batch_token_ids, batch_segment_ids, batch_labels = [], [], []
#         for is_end, (text, label) in self.sample(random):
#             token_ids, segment_ids = tokenizer.encode(text, maxlen=MAX_LEN)
#             batch_token_ids.append(token_ids)
#             batch_segment_ids.append(segment_ids)
#             batch_labels.append([label])
#             if len(batch_token_ids) == self.batch_size or is_end:
#                 batch_token_ids = sequence_padding(batch_token_ids)
#                 batch_segment_ids = sequence_padding(batch_segment_ids)
#                 batch_labels = sequence_padding(batch_labels)
#                 yield [batch_token_ids, batch_segment_ids], batch_labels
#                 batch_token_ids, batch_segment_ids, batch_labels = [], [], []


def build_model(config_path, ckpt_path, num_classes, lr):
    # Load the pre-trained model
    bert = build_transformer_model(
        config_path=config_path,
        checkpoint_path=ckpt_path,
        model='bert',
        return_keras_model=False,
    )

    output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
    if num_classes == 2:
        activation = 'sigmoid'
    else:
        activation = 'softmax'
    output = Dense(
        units=num_classes,
        activation=activation,
        kernel_initializer=bert.initializer
    )(output)

    model = keras.models.Model(bert.model.input, output)

    # AdamLR (defined above) is Adam extended with a piecewise-linear learning rate.
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=AdamLR(learning_rate=lr, lr_schedule={
            1000: 1,
            2000: 0.1
        }),
        metrics=['accuracy'],
    )
    return model


def evaluate(data, model):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        y_true = y_true[:, 0]
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total


# class Evaluator(keras.callbacks.Callback):
#     """Evaluation and checkpointing.
#     """
#     def __init__(self, model, valid_generator, test_generator):
#         super().__init__()
#         self.best_val_acc = 0.
#         self.model = model
#         self.valid = valid_generator
#         self.test = test_generator

#     def on_epoch_end(self, epoch, logs=None):
#         val_acc = evaluate(self.valid, self.model)
#         if val_acc > self.best_val_acc:
#             self.best_val_acc = val_acc
#             self.model.save_weights('../models/text_classifier/best_model.weights')
#         test_acc = evaluate(self.test, self.model)
#         logger.info(f"val_acc: {val_acc}, best_val_acc:{self.best_val_acc}, test_acc: {test_acc}")


# def train_model(train_generator, valid_generator, test_generator, num_classes, epochs, lr):
#     model = build_model(num_classes, lr)
#     evaluator = Evaluator(model, valid_generator, test_generator)

#     model.fit(
#         train_generator.forfit(),
#         steps_per_epoch=len(train_generator),
#         epochs=epochs,
#         callbacks=[evaluator]
#     )
#     model.load_weights('../models/text_classifier/best_model.weights')
#     return model


def run_cls(model, tokenizer, test_text: str, labels=["negative", "positive"]):
    """Classify a single text and return its label name."""
    token_ids, segment_ids = tokenizer.encode(test_text, maxlen=MAX_LEN)
    tok_ids = sequence_padding([token_ids])
    seg_ids = sequence_padding([segment_ids])
    predicted_data = model.predict([tok_ids, seg_ids])[0]
    return labels[np.argmax(predicted_data)]


# def train(df: pd.DataFrame, epochs: int = 20, lr: float = 1e-3):
#     train, valid, test, labels = load_data(df)
#     model = train_model(train, valid, test, len(labels), epochs, lr)
#     # score = evaluate(test.forfit(), model)
#     model.save('../models/text_classifier/best_model.h5')
#     return model, labels

def load_cls_model(config_path: str, ckpt_path: str, weight_path: str):
    """Load the local classification model.

    Args:
        config_path (str): config.json
        ckpt_path (str): ckpt
        weight_path (str): best weight

    Returns:
        keras.Model: pretrained model
    """
    model = build_model(config_path, ckpt_path, 2, 0.001)
    model.load_weights(weight_path)
    return model


if __name__ == '__main__':
    from keras.models import load_model
    dict_path = '../models/tokenizer/vocab.txt'
    tokenizer = Tokenizer(dict_path, do_lower_case=True)
    model = load_model('../models/text_classifier/best_model.h5')
    rst = run_cls(model, tokenizer, "这部电影太棒了")
    print(rst)
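A note on why run.py imports AdamLR from this module even though it never calls it: best_model.h5 was compiled with the custom AdamLR optimizer, and importing the module executes the extend_with_piecewise_linear_lr call, which (as far as I can tell) bert4keras also registers in Keras's custom-object table, letting load_model deserialize the file. If loading still fails in another setup, the standard Keras fallback is to pass the class explicitly:

# Fallback sketch: hand the custom optimizer class to the deserializer.
from keras.models import load_model
model = load_model('../models/text_classifier/best_model.h5',
                   custom_objects={'AdamLR': AdamLR})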
@@ -0,0 +1,54 @@
#! -*- coding: utf-8 -*-
import os
os.environ["TF_KERAS"] = "1"
import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import AutoRegressiveDecoder


class ArticleCompletion(AutoRegressiveDecoder):
    """Article continuation based on random sampling.
    """
    def __init__(self, start_id, end_id, maxlen, minlen, config_path, ckpt_path):
        super().__init__(start_id, end_id, maxlen, minlen)
        self.model = build_transformer_model(
            config_path=config_path,
            checkpoint_path=ckpt_path,
            segment_vocab_size=0,  # no segment embeddings: pure LM input
            application='lm',
        )

    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = np.concatenate([inputs[0], output_ids], 1)
        return self.last_token(self.model).predict(token_ids)

    def generate(self, text, tokenizer, n=1, topp=0.95):
        """Generate continuations of the input text.

        Args:
            text (str): input text
            tokenizer (Tokenizer): tokenizer
            n (int, optional): number of samples. Defaults to 1.
            topp (float, optional): nucleus-sampling threshold. Defaults to 0.95.

        Returns:
            list: generated texts
        """
        token_ids = tokenizer.encode(text)[0][:-1]  # drop the trailing [SEP]
        results = self.random_sample([token_ids], n, topp=topp)  # random sampling
        return [text + tokenizer.decode(ids) for ids in results]


if __name__ == '__main__':
    article_completion = ArticleCompletion(
        start_id=None,
        end_id=511,  # 511 is the Chinese full stop
        maxlen=256,
        minlen=128,
        config_path='../models/nezha_gpt/config.json',
        ckpt_path="../models/nezha_gpt/gpt.ckpt"
    )
    tokenizer = Tokenizer("../models/tokenizer/vocab.txt", do_lower_case=True)
    print(article_completion.generate(u'中国科学院青岛生物能源与过程研究所泛能源大数据与战略研究中心', tokenizer))
File diff suppressed because it is too large
@@ -0,0 +1,13 @@
Flask==2.1.0
protobuf==3.19.4
bert4keras==0.11.3
numpy~=1.19.2
hanlp==2.1.0b39
hanlp_common==0.0.18
Keras==2.3.1
logzero==1.7.0
pandas==1.3.5
scikit_learn==1.0.2
tensorflow==2.5.3
tqdm==4.64.0
librosa==0.9.2
@@ -0,0 +1,193 @@
# -*-coding:utf-8-*-
import os
import sys
import json
from flask import Flask, request, make_response
from logzero import logger
# current_path = os.path.dirname(os.path.abspath(__file__))  # for local runs
current_path = "/app"  # for docker
logger.info(f"{current_path}")


sys.path.append(f"{current_path}/nlp/")
os.environ["TF_KERAS"] = "1"
os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'


from bert4keras.tokenizers import Tokenizer, load_vocab
from keras.models import load_model

from nlp.text_gen import ArticleCompletion
from nlp.chat import build_chat_model
from nlp.text_classification import run_cls as run_class, AdamLR
# from utils.translate import load_model as load_translator, run_test as run_translator
from nlp.hanlp_tools import text_analysis, text_simi
from nlp.reading import build_reading_model, gen_answer


general_tokenizer = Tokenizer(f"{current_path}/models/tokenizer/vocab.txt", do_lower_case=True)  # general-purpose tokenizer
dialog_tokenizer = Tokenizer(f"{current_path}/models/nezha_gpt_dialog/vocab.txt", do_lower_case=True)  # dialogue tokenizer

# Load and simplify the vocab for the reading-comprehension tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=f"{current_path}/models/tokenizer/vocab.txt",
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
)
reading_tokenizer = Tokenizer(token_dict, do_lower_case=True)  # reading-comprehension tokenizer
cls_model = load_model(f"{current_path}/models/text_classifier/best_model.h5")  # load the classification model
# Load the text-generation model
gen_model = ArticleCompletion(
    start_id=None,
    end_id=511,  # 511 is the Chinese full stop
    maxlen=256,
    minlen=128,
    config_path=f"{current_path}/models/nezha_gpt/config.json",
    ckpt_path=f"{current_path}/models/nezha_gpt/gpt.ckpt"
)
# Load the dialogue model
chatbot = build_chat_model(
    f"{current_path}/models/nezha_gpt_dialog/",
    dialog_tokenizer)
# # Load the translation model
# translator, trans_data = load_translator(
#     f"{current_path}/models/translator/translation.h5",
#     f"{current_path}/data/translator/train.txt",
#     f"{current_path}/data/translator/dev.txt"
# )
# Load the reading-comprehension model
reading_model = build_reading_model(
    f"{current_path}/models/nezha_gpt/config.json",
    f"{current_path}/models/nezha_gpt/gpt.ckpt",
    keep_tokens,
    f"{current_path}/models/qa/best_model.weights"
)

app = Flask(__name__)


@app.route('/text_cls/', methods=["POST"])
def run_cls():
    resp = make_response()
    if request.method == "POST":
        text = request.form.get('text')
        if text is not None and text != '':
            resp.response = run_class(cls_model, general_tokenizer, text)
            resp.status_code = 200
            return resp
        else:
            resp.status_code = 406
            return resp
    else:
        resp.status_code = 405
        return resp


@app.route('/text_gen/', methods=["POST"])
def run_gen():
    resp = make_response()
    if request.method == "POST":
        text = request.form.get('text')
        logger.info(f"Continuing the text '{text}'")
        if text:  # also guards against a missing form field (None)
            rest = gen_model.generate(text, general_tokenizer)
            logger.info(rest)
            resp.response = rest
            resp.status_code = 200
            return resp
        else:
            resp.status_code = 406
            return resp
    else:
        resp.status_code = 405
        return resp


@app.route('/chat/', methods=["POST"])
def run_chat():
    # TODO: this module could be served as a gRPC streaming service instead;
    # with Flask, the whole dialogue history is passed in per request,
    # and it could also be sent as JSON.
    resp = make_response()
    if request.method == "POST":
        dialog_history = request.form.get("dialog_history")
        if dialog_history:  # guard before splitting: None has no .split
            dialog_history = dialog_history.split('。')
            logger.info(f"Chatting on the history '{dialog_history}'")
            rest = chatbot.response(dialog_history)
            logger.info(rest)
            resp.response = rest
            resp.status_code = 200
            return resp
        else:
            resp.status_code = 406
            return resp
    else:
        resp.status_code = 405
        return resp


# @app.route('/translate/', methods=["POST"])
# def run_translate():
#     resp = make_response()
#     if request.method == "POST":
#         text = request.json.get('text')
#         if text is None or text.strip() == "":
#             resp.status_code = 406
#             return resp
#         rest = run_translator(text, translator, trans_data)
#         resp.status_code = 200
#         resp.response = rest
#         return resp
#     else:
#         resp.status_code = 405
#         return resp


@app.route('/simi/', methods=["POST"])
def run_match():
    resp = make_response()
    if request.method == "POST":
        src = request.form.get('text_1')
        tgt = request.form.get('text_2')
        resp.response = str(text_simi(src, tgt))
        resp.status_code = 200
        return resp
    else:
        resp.status_code = 405
        return resp


@app.route('/dependency/', methods=["POST"])
def run_depend():
    resp = make_response()
    if request.method == "POST":
        text = request.form.get('text')
        if text is None or text.strip() == "":
            resp.status_code = 406
            return resp
        resp.response = str(text_analysis(text))
        resp.status_code = 200
        return resp
    else:
        resp.status_code = 405
        return resp


@app.route('/reading/', methods=["POST"])
def run_reading():
    resp = make_response()
    if request.method == "POST":
        question = request.form.get("question")
        passages = request.form.get("passages")
        if not passages or question is None or question.strip() == "":
            resp.status_code = 406
            return resp
        passages = [x + '。' for x in passages.split('。') if x]
        resp.response = json.dumps(gen_answer(question, passages, reading_model, reading_tokenizer))
        resp.status_code = 200
        return resp
    else:
        resp.status_code = 405
        return resp


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8903, debug=True)
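For manual testing, a minimal client sketch against the endpoints above (assumes the service is reachable on localhost:8903, matching app.run; the requests package is an extra dependency, not in requirements.txt):

import requests

BASE = "http://127.0.0.1:8903"

print(requests.post(f"{BASE}/text_cls/", data={"text": "这部电影太棒了"}).text)
print(requests.post(f"{BASE}/text_gen/", data={"text": "中国科学院"}).text)
print(requests.post(f"{BASE}/simi/", data={"text_1": "今天天气不错", "text_2": "今天天气很好"}).text)
print(requests.post(f"{BASE}/dependency/", data={"text": "台湾省是中国不可分割的一部分。"}).text)
print(requests.post(f"{BASE}/reading/", data={"question": "嬴政出生在哪里?", "passages": "嬴政出生在当时赵国的邯郸。"}).text)
print(requests.post(f"{BASE}/chat/", data={"dialog_history": "你好。最近怎么样"}).text)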