841 lines
916 KiB
Plaintext
841 lines
916 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"attachments": {
|
|||
|
"image.png": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAzYAAABCCAYAAACFFEMLAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAACp4SURBVHhe7d0HXBTX2gfgP71LL4KIFAFBsQtqFNGosUSjscWo15ZETXI1URNNrFhjYqLez5ZiLDEm9m5iQ1GDBHtv2CJVqrRd2N33m92dLSAoKojc+z6/jNlpy86ZM+e8Z+bMjAEJwBhjjDHGGGPVmKH4f8YYY4wxxhirtrhhwxhjjDHGGKv2uGHDGGOMMcYYq/a4YcMYY4wxxhir9rhhwxhjjDHGGKv2uGHDGGOMMcYYq/a4YcMYY4wxxhir9rhhwxhjjDHGGKv2uGHDGGOMMcYYq/a4YcMYY4wxxhir9rhhwxhjjDHGGKv2uGHDGGOMMcYYq/a4YcMYY4wxxhir9gxIIH5+IVKpFKmpqeIYY4wxxhhjjFUMFxcXmJmZiWOlq7CGzZEjRxARESGOMcYYY4wxxljFULY1wsPDxbHSVVjD5saNG1i4cKE4xhhjjDHGGGMVY8KECahbt644VroKa9gwxhhjjDHGWFXhhwcwxhhjjDHGqj1u2DDGGGOMMcaqPW7YMMYYY4wxxqo9btgwxhhjjDHGqj1u2DDGGGOMMcaqPW7YMMYYY4wxxqo9btgwxhhjjDHGqj1u2DDGGGOMMcaqvVfqBZ2yh2exa8Pv2Bl1GvFJuYCVA9zqNkenXv3Qu0MQrG7/hInrAzFnZmvYiOtUJkXeDexeugzb/r6JB9mFUCiM4RDwGnoNH4X+zRxhJC7330Jy61dMGrcM5yw74cslX6Cjm7E4hzHGnoccqdErsfxAEmTiFH0GxpZwcHOHu3sdBIe1RLCzqTinauVfWYtvN9yEVBw38eyOj0c2QNIv32LDTe1UeHb/GCND7fkMYZnycWWtfpo9jTFqdhyND9q6lKhfC5EatwlLV56A96RFGOpXznxSmIzjq/+DtVGX8E+GBEZ2Xghu1BTNWzeD7dG52NjkB/zQzUFcmFUcGR6e3YUNv+9E1Ol4qMM5N9Rt3gm9+vVGhyAr3P5pItYHzsHM1hUczUlu4ddJ47DsnCU6fbkEX3R0E3JV9VOyDNIxgKGZDZyFctPDuwFatwmBi97hIL21CYvXXUCOQpygz9ABYSM+RLfapRw/+Vew9tsNKPehClP49h+PofXxhGPcELZNh+Ljt7xhpsjCqVVLsONekThP2BJzfwwYPxhB5uKEiqJs2FQ5WRodX/A2+ZuAYFiX3l18iG7nyKjo0T2K2zKTenoZEMzcyMMaZN5pK6WLq1WmgivL6S0P4ffAiiK+PUc5cjmlbutNtkKSAf407tgjccnqSXJ7F60+8ICKxHGiNNrSxULZyFUNnuNOUq44hzHGnpc0/SbFbJpK4UL5rSlfgJrUd/7PtPb7hfT524FkoppmRw16fkFb4gvENauOLPM8rR3mqfu9Id/SrSIZZZ5fS8M8NdsACvn2ll4Zyh4np9x752j/siFUR5OWysGqFX208Cdat2opLZjxOY3p35o8DNTz/KdfIIm4NhWl0pnfZtK7LVzEdX1o8pny5Y+ihO00JsSMULMPLY9LF/aTjHLuHqMVw4LF/AZquS5FXJpVFFnacVrwtr8qjQ3rvkuLD92mHFkRPboXR1tm9iQvYT+buXmQNcyp09aKj+bStnQhC3H/wnMcnaymgYw89x6d27+UBnmJ26IanKjHnA20a8sKmhBhr55Wox71mLqb7kvF9fIS6eLhH2ikv/56IL/hy2n/qZuUqj24SpDn0r1z+2npIK9i61m/9m9auGo9/bJqBS3+agq990agsO/U81qtT1WuSDl3YmnztHba6arB/k36Zu8pupkiEZZQEsrP+FjasbAfeQrzjYLepx+P3KRH6pkVquobNrJk2j3anwxUiVGDuq+++1hFIX8UR/NeM1cllunLaNjI7tKPEa
bqnWPRlbaJfzBjV09xx5lQxC9J4s6qhh7F0JSGFuRVrPGSRQeHOKm3WRgazL6kq1wYY+yFZNCONy215QscBtHBLHGWPIW297XTzfMYQXtTq750zdzbRzyRJQyqho1qKu3tY6v9rdywKaeCMzTZV0xL5dBiFSWW2MW557+ldhaaho2Erq8ZR2+/3pbC6gqNE8165W3YCHlqWx91nmqx8p4QUul7RCenNiRDYR43bCqWLHk3jfY3UO+rGt1p9d3Hojl6FDePXjNX7kvTSmnYZB0cQk6a/NJgNl2q1oFMAZ2Z7KveFuVgHEEb09Rz5IkbqIulOB2G1GTORWFpDQldmhWoWw8N6Jvr5SupCs5MJl/tesLxsyqxRKwrp4yjn1OwgaHYsBFl7KDu2t8DchpySDjSSiG/Tyub2VDY6pLfW3Gq+Aq6DHfXDMM7y2+oUgK+HyGyv9djlw0NbZph4ubfMcRVnFDZ0mPw6/FC9WcbN9iZqD/ad1qJAxuWYMmvR/FrX7fq2f1AloRtH/fB7PMFKH6l0hbtFu3Dmumf4LNvdmH7+GCYiXMYY+zFGMKyhl5/AwND5X9qhi4I/1eErntxwlp8fSC1RPn08hkYGsFA/KxjAEOjx6eypzAwgZl+3zLl/hc/aliFjMaiCSGwUI2ZoHbP2fj9wFEc3TEd9VTTnkF2LFb/kSV8MISRScn9aIPQST/jC3/LUvYve26yu1gz7B0sv6GK5oRwLhL9vR6L5mDTbCI2/z4ElRXO2bZbhH1rpuOTz77Bru3jEVytAxkDmOgfOMJxYyhmWkPHRmhbR/0ZQml5Zt12xGt7gwnrmeunvRlqWJQvYjUwMSvWDdRA8we1DGHfdhIWvltHOEr1GJrBWq+Hm4W9ZekxsqEVnJ2sYWVlWvr8ClC1sXlWNGZ/uQ854mjtHj0QUEZfOyPXrpg5pyNcy7ixRSHJREaeXBwrDwUkGSnIkDxefUof3kKiphugqTXMNKlk6oqwAR9jTM96MMzORal/TZ6Hh4kpyJJWXrWskGYiNVNaasWvTIek5EyU+udlCdj+cQTeXpMgTijOyL4Zhsz4Fl+N7w6fsvo8KiTIzMgrfdvLoJBkICVDUuWBCmOs6hg8IYo0tnGEpfgZKELK/axS78lRltv5GRnIf2oBJEPuw3Q8sUpQlmVJycisxLKaPV1+3HT0jTwNCcwR0HswWruZCGGZIcxtrVQBlnENF1irFy03eU4iHuQqPykQ89l7mH84CeKpSjXLBhg2/k3425QRUJQn/yhjiMyMpyxTgkKKzNTS62dFfgYynp6xX1lZ0bPx5T5tNIcePQKEPVoaI7h2nYk5HV3LuE+5fOkqy3moi99kecjTFBhG9mg2ZAa+/Wo8upcZyGiUHQcWo8hHRkb+M8U9lc7IAvrnip5GeyLpBaRvG6SKHxWwQ4tBbyKohv4eLPEHnlDgGzypMqgAVdqwyYr5HluSxRGh7ecb6qNXuZVkjDqDfsLG6aF6hZwE8ZsnonN9bwR1GIyhXUPgFRCO0asvam+ckl5bgQENa8HF2RnOwuDeYjKiTq7EkBBbWDi6wdHCHL69FyI2S7mCAkm/9YR/65m4pl4deLAUb3gp13WCvZCLjIQdYmxlD9fXf8Bdbc0rxf0/F2BIqDusnVti8IfvoKmdJdwbtUfX7t3RvfubGDg7ClFT28PXTf07nJ3roNemFChy/sL0bg3g6SJOd62HEQezhZ+SiaiJbeGjXd4X/XfexvG5XVHb3AGuDtZoOO4g0pU/W56OE4tHIqJ+A7zWcyhG9m4IGyMz1Go9FF8f0ty0m4/TMzqj14rr6qtjgn/+0wlezh4Imx6No9Paw0/7tzzx+tIbxSoCSfxmTOxcH95BHTB4aFeEeAUgfPRqXNQlNFYMaIhamu1wb4HJUSexckgIbC0c4eZoAXPf3lgYmyWkMmOM6UgfPoBQ6ons0CDEFcVuby38B7undEVQ7UB0+tcghPt6IGTAYsRm6IcaCuRe/Q
2f92yGgJAOeOdfHVHLxAaBb83An0mawlqO9BOLMTKiPhq81hNDR/ZGQyG4NavVGkO/PgTtYuwlkeL2H7twNZdU9YJ5
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"1. 首先生成几个针对question.txt的结果,每行一个,存储在txt中;\n",
|
|||
|
"2. 选择哪几个模型?\n",
|
|||
|
"\n",
|
|||
|
"- GPT系列\n",
|
|||
|
"- GLM3\n",
|
|||
|
"- 百度\n",
|
|||
|
"- Qwen1.5-72b-chat \n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"https://github.com/yuyouyu32/LLMQAEvaluate?tab=readme-ov-file"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 30,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Convert the QA spreadsheet into two line-aligned text files (one record per line).\n",
"import pandas as pd\n",
"\n",
"def _write_lines(path, items):\n",
"    '''Write every item on exactly one line so that line i of each file belongs to QA pair i.'''\n",
"    with open(path, \"w\", encoding=\"utf-8\") as file:\n",
"        for item in items:\n",
"            # Collapse embedded line breaks: the files are read back line by line and paired\n",
"            # by position, so a multi-line cell would shift every later record.\n",
"            file.write(\" \".join(str(item).splitlines()) + \"\\n\")\n",
"\n",
"data = pd.read_excel(\"/home/zhangxj/WorkFile/LCA-GPT/QA/QA.xlsx\")\n",
"q = data['question'].values.tolist()\n",
"a = data['answer'].values.tolist()\n",
"\n",
"_write_lines(\"/home/zhangxj/WorkFile/LCA-GPT/QA/filters/question.txt\", q)\n",
"_write_lines(\"/home/zhangxj/WorkFile/LCA-GPT/QA/filters/answers.txt\", a)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 1,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import numpy as np\n",
|
|||
|
"from transformers import AutoTokenizer,AutoModel\n",
|
|||
|
"import torch\n",
|
|||
|
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
|||
|
"from sentence_transformers import SentenceTransformer\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"model_name = \"/home/zhangxj/models/acge_text_embedding\"\n",
|
|||
|
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
|
|||
|
"# model = AutoModel.from_pretrained(model_name)\n",
|
|||
|
"model = SentenceTransformer(model_name)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 3,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"(2, 1792)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def embedding(text):\n",
"    '''Encode `text` with the notebook-level SentenceTransformer `model`.\n",
"\n",
"    `normalize_embeddings=True` is passed to `model.encode`; whatever the encoder\n",
"    returns is handed back unchanged.\n",
"    '''\n",
"    return model.encode(text, normalize_embeddings=True)\n",
|
|||
|
"\n",
|
|||
|
"emb1 = embedding([\"你好,这里是中国\",\"欢迎你来到中国!\"])\n",
|
|||
|
"\n",
|
|||
|
"# from numpy.linalg import norm\n",
|
|||
|
"\n",
|
|||
|
"# cos_sim = lambda a,b:(a@b.T)/(norm(a)*norm(b))\n",
|
|||
|
"print(emb1.shape)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 4,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import re\n",
|
|||
|
"\n",
|
|||
|
"def remove_punctuation(text):\n",
|
|||
|
" # 正则表达式匹配中文标点和英文标点\n",
|
|||
|
" pstr = r\""#$&'()*+,-/:;@[\]^_`{|}~⦅⦆「」、 、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·!?。。\"\n",
|
|||
|
" return re.sub(pstr, ' ', text)\n",
|
|||
|
"\n",
|
|||
|
"def get_ans_list(file_path):\n",
"    '''Read a UTF-8 text file and return one cleaned string per line.\n",
"\n",
"    Every line is stripped of surrounding whitespace and then passed through\n",
"    `remove_punctuation`.\n",
"    '''\n",
"    with open(file_path, \"r\", encoding=\"utf-8\") as handle:\n",
"        stripped_lines = [raw.strip() for raw in handle]\n",
"    return [remove_punctuation(entry) for entry in stripped_lines]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"answers = get_ans_list(\"/home/zhangxj/WorkFile/LCA-GPT/QA/filters/answers.txt\")\n",
|
|||
|
"answer_rag = get_ans_list(\"/home/zhangxj/WorkFile/LCA-GPT/QA/eval/RAGpred.txt\")\n",
|
|||
|
"answer_qwen72 = get_ans_list(\"/home/zhangxj/WorkFile/LCA-GPT/QA/eval/Qwen72b.txt\")\n",
|
|||
|
"answer_glm = get_ans_list(\"/home/zhangxj/WorkFile/LCA-GPT/QA/eval/GLMpred.txt\")\n",
|
|||
|
"answer_baidu = get_ans_list(\"/home/zhangxj/WorkFile/LCA-GPT/QA/eval/ERNIEpred.txt\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"emb_ans = embedding(answers)\n",
|
|||
|
"emb_rag = embedding(answer_rag)\n",
|
|||
|
"emb_qwen72 = embedding(answer_qwen72)\n",
|
|||
|
"emb_glm = embedding(answer_glm)\n",
|
|||
|
"emb_baidu = embedding(answer_baidu)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 33,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"numpy.ndarray"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 33,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"type(emb_ans)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"### 指标1:余弦相似度,计算所有回答的cos_sim,取平均\n",
|
|||
|
"def cos_sim(target,pred):\n",
"    '''Return the mean row-wise cosine similarity between two embedding arrays.\n",
"\n",
"    target, pred: numpy.ndarray with one embedding vector per row; row i of `pred`\n",
"    is compared with row i of `target`, for every row of `target`.\n",
"    '''\n",
"    cos_sim_list = []\n",
"    for i in range(target.shape[0]):\n",
"        dot_product = np.dot(target[i], pred[i])\n",
"        # Each vector is scaled by its own norm; using the target norm for both sides\n",
"        # is only correct when every vector has unit length.\n",
"        norm_target = np.linalg.norm(target[i])\n",
"        norm_pred = np.linalg.norm(pred[i])\n",
"        cos_sim_list.append(dot_product / (norm_target * norm_pred))\n",
"\n",
"    return np.mean(cos_sim_list)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"cos RAG: 0.77057713\n",
|
|||
|
"cos GLM: 0.73152065\n",
|
|||
|
"cos Baidu: 0.73055154\n",
|
|||
|
"cos Qwen72b: 0.7132188\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"cos_sim_rag = cos_sim(emb_ans,emb_rag)\n",
|
|||
|
"cos_sim_glm = cos_sim(emb_ans,emb_glm)\n",
|
|||
|
"cos_sim_baidu = cos_sim(emb_ans,emb_baidu)\n",
|
|||
|
"cos_sim_qwen72b = cos_sim(emb_ans,emb_qwen72)\n",
|
|||
|
"print(\"cos RAG:\",cos_sim_rag)\n",
|
|||
|
"print(\"cos GLM:\",cos_sim_glm)\n",
|
|||
|
"print(\"cos Baidu:\",cos_sim_baidu)\n",
|
|||
|
"print(\"cos Qwen72b:\",cos_sim_qwen72b)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 11,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"### f1值\n",
|
|||
|
"import jieba\n",
|
|||
|
"import collections\n",
|
|||
|
"\n",
|
|||
|
"def cal_f1(target,pred):\n",
"    '''Token-level F1 between a reference string and a predicted string.\n",
"\n",
"    Both strings are segmented with jieba (cut_all=False); the overlap is the multiset\n",
"    intersection of the two token lists.\n",
"    '''\n",
"    ref_tokens = list(jieba.cut(target, cut_all=False))\n",
"    hyp_tokens = list(jieba.cut(pred, cut_all=False))\n",
"\n",
"    # If either side has no tokens, score 1 only when both are empty.\n",
"    if not ref_tokens or not hyp_tokens:\n",
"        return int(ref_tokens == hyp_tokens)\n",
"\n",
"    overlap = sum((collections.Counter(ref_tokens) & collections.Counter(hyp_tokens)).values())\n",
"    if overlap == 0:\n",
"        return 0\n",
"\n",
"    precision = 1.0 * overlap / len(hyp_tokens)\n",
"    recall = 1.0 * overlap / len(ref_tokens)\n",
"    return (2.0 * recall * precision) / (precision + recall)\n",
|
|||
|
"\n",
|
|||
|
"def calf1_all(target,pred):\n",
"    '''Mean of `cal_f1` over the pairs produced by zipping `target` with `pred`.'''\n",
"    return np.mean([cal_f1(ref, hyp) for ref, hyp in zip(target, pred)])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 12,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"{'RAG': 0.3687708615388102, 'GLM': 0.28978494097380453, 'Baidu': 0.28271629300303913, 'Qwen': 0.22956414318317087}\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"f1_dict = dict()\n",
|
|||
|
"f1_dict['RAG'] = calf1_all(answers,answer_rag)\n",
|
|||
|
"f1_dict['GLM'] = calf1_all(answers,answer_glm)\n",
|
|||
|
"f1_dict['Baidu'] = calf1_all(answers,answer_baidu)\n",
|
|||
|
"f1_dict['Qwen'] = calf1_all(answers,answer_qwen72)\n",
|
|||
|
"print(f1_dict)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 13,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"### 指标2:BLEU 支持中文?\n",
|
|||
|
"import nltk\n",
|
|||
|
"from nltk.translate.bleu_score import sentence_bleu,SmoothingFunction\n",
|
|||
|
"import jieba\n",
|
|||
|
"\n",
|
|||
|
"''' 到时候需要遍历整个文档的每一行进行计算,之后统计平均值'''\n",
|
|||
|
"def Recall(target,pred):\n",
"    '''Sentence-level BLEU of `pred` against the single reference `target`.\n",
"\n",
"    Despite its name this returns a BLEU score, not recall. Both inputs are plain\n",
"    strings and are split into individual characters, so the n-grams are character n-grams.\n",
"    '''\n",
"    reference_chars = [ch for ch in target]\n",
"    hypothesis_chars = [ch for ch in pred]\n",
"\n",
"    # method2 smoothing from nltk's SmoothingFunction\n",
"    smoothing = SmoothingFunction().method2\n",
"    return sentence_bleu([reference_chars], hypothesis_chars, smoothing_function=smoothing)\n",
|
|||
|
"\n",
|
|||
|
"def bleu_mean(target,pred):\n",
"    '''Average the per-pair score from `Recall` over two parallel lists of strings.'''\n",
"    scores = [Recall(ref, hyp) for ref, hyp in zip(target, pred)]\n",
"    return np.mean(scores)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 14,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"bleu RAG: 0.2116179659793689\n",
|
|||
|
"bleu GLM: 0.11521748257475595\n",
|
|||
|
"bleu Baidu: 0.11631696686819416\n",
|
|||
|
"bleu Qwen72b: 0.08817520468716526\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"bleu_rag = bleu_mean(answers,answer_rag)\n",
|
|||
|
"bleu_glm = bleu_mean(answers,answer_glm)\n",
|
|||
|
"bleu_baidu = bleu_mean(answers,answer_baidu)\n",
|
|||
|
"bleu_qwen72b = bleu_mean(answers,answer_qwen72)\n",
|
|||
|
"print(\"bleu RAG:\",bleu_rag)\n",
|
|||
|
"print(\"bleu GLM:\",bleu_glm)\n",
|
|||
|
"print(\"bleu Baidu:\",bleu_baidu)\n",
|
|||
|
"print(\"bleu Qwen72b:\",bleu_qwen72b)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 28,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"['生命周期分析旨在评估产品或服务从原材料获取到最终处置的环境影响。',\n",
|
|||
|
" '研究范围包括定义系统边界,如输入、输出、功能单位和分析阶段。',\n",
|
|||
|
" '医疗废物应严格遵循分类收集、安全贮存、密闭运输、集中处置的原则,确保无害化处理,防止疾病传播和环境污染。数据清单需收集所有过程的输入输出数据,包括资源消耗、排放和能源使用。',\n",
|
|||
|
" '该阶段旨在量化每个阶段对环境的各种影响,如气候变化、水耗和土地使用。',\n",
|
|||
|
" '结果分析揭示了不同阶段的环境热点,有助于制定减少负面影响的策略。']\n",
|
|||
|
"['生命周期分析(LCA)旨在全面评估产品、服务或活动全周期内的环境负荷,涵盖资源消耗、能源利用及废物排放等,为环境、社会、经济决策提供综合信息,促进可持续性的理解与优化。',\n",
|
|||
|
" '在LCA中,研究范围界定涉及两维度:一是全生命周期过程,涵盖系统边界如“摇篮到坟墓”等;二是自然资源影响类型,通过选择LCIA指标如资源、气候变化、大气环境、水体及土壤毒性等来确定。两维度共同明确LCA研究范畴。',\n",
|
|||
|
" '医疗废物需严格分类收集、安全贮存、密闭运输并集中处置,以保障无害化处理,避免疾病传播与环境污染。',\n",
|
|||
|
" 'LCA数据清单收集涵盖产品全生命周期:原材料获取、生产、使用、维护及废弃处理,需详细记录所有物质与能量的输入输出,含材料成分、能耗、排放种类及数量。',\n",
|
|||
|
" '生命周期影响评价旨在全面评估产品、服务或过程的全生命周期环境潜在影响,涵盖资源消耗、人类健康及生态影响,为减轻环境负担策略的制定提供依据。']\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pprint\n",
|
|||
|
"pprint.pprint(answers[:5])\n",
|
|||
|
"pprint.pprint(answer_rag[:5])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 45,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"### 指标3:Rouge-l,Rouge-w\n",
|
|||
|
"from rouge_chinese import Rouge\n",
|
|||
|
"import jieba\n",
|
|||
|
"\n",
|
|||
|
"def calRouge(target,pred):\n",
"    '''Average ROUGE-L (f, p, r) over paired reference/prediction strings.\n",
"\n",
"    target, pred: parallel lists of strings; each string is segmented with jieba and\n",
"    re-joined with spaces before scoring. Returns a tuple (f, p, r); higher is better.\n",
"    Returns (0.0, 0.0, 0.0) when there are no pairs.\n",
"    '''\n",
"    rouger = Rouge()  # one scorer is enough for the whole loop\n",
"    f = p = r = 0.0\n",
"    count = 0\n",
"    for targ, pre in zip(target, pred):\n",
"        target_cut = ' '.join(jieba.cut(targ, cut_all=False))\n",
"        pred_cut = ' '.join(jieba.cut(pre, cut_all=False))\n",
"\n",
"        rougeL = rouger.get_scores(pred_cut, target_cut)[0]['rouge-l']\n",
"        f += rougeL['f']\n",
"        p += rougeL['p']\n",
"        r += rougeL['r']\n",
"        count += 1\n",
"\n",
"    if count == 0:\n",
"        return 0.0, 0.0, 0.0\n",
"    # Divide by the number of scored pairs, not by the length of the global `answer_rag`.\n",
"    return f / count, p / count, r / count"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"## 指标3:rouge\n",
|
|||
|
"from rouge_score import rouge_scorer\n",
|
|||
|
"\n",
|
|||
|
"def rouge(predict, target):\n",
"    '''ROUGE-L F-measure of `predict` against the reference `target` (rouge_score package).\n",
"\n",
"    NOTE(review): RougeScorer's default tokenizer appears to keep only ASCII letters and\n",
"    digits, so Chinese text may be scored on very few tokens - confirm, and consider\n",
"    passing a custom tokenizer if this is used on Chinese answers.\n",
"    '''\n",
"    # Only ROUGE-L is reported, so only ROUGE-L is computed.\n",
"    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)\n",
"    # RougeScorer.score expects (target, prediction), in that order.\n",
"    score = scorer.score(target, predict)\n",
"    return score['rougeL'].fmeasure\n",
|
|||
|
"\n",
|
|||
|
"def rouge_all(target,pred):\n",
"    '''Mean of `rouge(prediction, reference)` over the zipped (target, pred) pairs.'''\n",
"    return np.mean([rouge(hyp, ref) for ref, hyp in zip(target, pred)])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"{'rag': 0.3427058377450959, 'glm': 0.147907771037772, 'baidu': 0.19862760853771455, 'qwen': 0.1585418622278237}\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"rouge_dict = dict()\n",
|
|||
|
"rouge_dict[\"rag\"] = rouge_all(answers,answer_rag)\n",
|
|||
|
"rouge_dict[\"glm\"] = rouge_all(answers,answer_glm)\n",
|
|||
|
"rouge_dict[\"baidu\"]= rouge_all(answers,answer_baidu)\n",
|
|||
|
"rouge_dict[\"qwen\"] = rouge_all(answers,answer_qwen72)\n",
|
|||
|
"\n",
|
|||
|
"print(rouge_dict)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 17,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"### 指标4:METEOR\n",
|
|||
|
"from nltk.translate.meteor_score import meteor_score\n",
|
|||
|
"import jieba\n",
|
|||
|
"\n",
|
|||
|
"def calMeteor(target,pred):\n",
"    '''Mean METEOR score over paired reference/prediction strings.\n",
"\n",
"    Each string is tokenised with jieba (cut_all=False); every prediction is scored\n",
"    against its single reference with nltk's `meteor_score`.\n",
"    '''\n",
"    def _tokens(text):\n",
"        return list(jieba.cut(text, cut_all=False))\n",
"\n",
"    per_pair = [meteor_score([_tokens(ref)], _tokens(hyp)) for ref, hyp in zip(target, pred)]\n",
"    return np.mean(per_pair)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 18,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"meteor_rag = calMeteor(answers,answer_rag)\n",
|
|||
|
"meteor_glm = calMeteor(answers,answer_glm)\n",
|
|||
|
"meteor_baidu = calMeteor(answers,answer_baidu)\n",
|
|||
|
"meteor_qwen = calMeteor(answers,answer_qwen72)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 19,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"RAG: 0.33359169849787107\n",
|
|||
|
"glm: 0.29077378118461694\n",
|
|||
|
"baidu: 0.30964530404786356\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(\"RAG:\",meteor_rag)\n",
"print(\"glm:\",meteor_glm)\n",
"print(\"baidu:\",meteor_baidu)\n",
"print(\"qwen:\",meteor_qwen)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 20,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"### 困惑度 越小越好\n",
|
|||
|
"import math\n",
|
|||
|
"\n",
|
|||
|
"def cal_perplexity(target,pred):\n",
"    '''Perplexity of the tokens of `pred` under a unigram model counted from `target`.\n",
"\n",
"    Both arguments are single-line strings segmented with jieba. A predicted token that\n",
"    is absent from `target` gets probability 1 / (N + 1), where N is the number of target\n",
"    tokens. Lower is better. Returns 0 when `pred` yields no tokens.\n",
"    '''\n",
"    ref_tokens = list(jieba.cut(target, cut_all=False))\n",
"    hyp_tokens = list(jieba.cut(pred, cut_all=False))\n",
"\n",
"    # Unigram counts of the reference tokens\n",
"    counts = {}\n",
"    for tok in ref_tokens:\n",
"        counts[tok] = counts.get(tok, 0) + 1\n",
"    n_ref = len(ref_tokens)\n",
"\n",
"    if len(hyp_tokens) == 0:\n",
"        return 0\n",
"\n",
"    # Accumulate log2-probabilities of the predicted tokens\n",
"    log_total = 0\n",
"    for tok in hyp_tokens:\n",
"        seen = counts.get(tok, 0)\n",
"        prob = seen / n_ref if seen != 0 else 1 / (n_ref + 1)\n",
"        log_total += math.log2(prob)\n",
"\n",
"    return 2 ** (-log_total / len(hyp_tokens))\n",
|
|||
|
"\n",
|
|||
|
"def cal_perplexity_all(target,pred):\n",
"    '''Return the list of per-pair `cal_perplexity` values (one entry per zipped pair, not a mean).'''\n",
"    return [cal_perplexity(ref, hyp) for ref, hyp in zip(target, pred)]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 21,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# 可视化\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"def Visilize(rag,glm,baidu,qwen,index:str):\n",
"    '''Plot the four per-answer metric series on one 10x6 figure.\n",
"\n",
"    rag, glm, baidu, qwen: sequences of values, one per answer; `index` is the metric\n",
"    name used as the y-axis label.\n",
"    '''\n",
"    series = (\n",
"        (rag, 'RAG', 'blue'),\n",
"        (glm, 'GLM-4.0', 'green'),\n",
"        (baidu, 'Ernie-3.5-8k', 'pink'),\n",
"        (qwen, 'Qwen1.5-72b', 'yellow'),\n",
"    )\n",
"\n",
"    plt.figure(figsize=(10, 6))\n",
"    for values, legend_name, line_color in series:\n",
"        plt.plot(values, label=legend_name, color=line_color)\n",
"    plt.legend()\n",
"\n",
"    # Title and axis labels\n",
"    plt.title('Comparison of the four models')\n",
"    plt.xlabel('answer')\n",
"    plt.ylabel(index)\n",
"\n",
"    plt.show()\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 22,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"RAG : [16.786638028689584, 17.125955631223814, 20.736037651049934, 19.26841443888935, 20.652390300741697, 17.502944402346472, 19.593834605324616, 29.17766892588523, 21.206677854726262, 18.690592468508477, 18.214512843337076, 13.872886842958078, 16.2878510142501, 18.511767544763405, 16.75047893388824, 16.39677836211086, 16.876774544766548, 18.434901377239807, 9.8259319385269, 15.743917370123441, 14.709946780872098, 12.360007858523181, 18.551149811344185, 23.66841028783834, 11.98904714558483, 15.805827983787744, 21.967010554281103, 19.860058999623103, 15.828819571743407, 21.397270953884853, 24.29048269055959, 12.71061340864174, 19.012603368710344, 16.14992793023593, 9.681012305078742, 9.213207249994932, 17.54807492359868, 20.8586901415855, 22.491741389645526, 16.209211838409622, 17.178101615259603, 15.872733511076348, 11.868371903750413, 15.794803886678576, 11.74178451160269, 17.64259544970864, 18.596283187738372, 18.981155141907614, 21.771799975682192, 16.97322517768421, 22.01213064585448, 19.69478983036272, 21.605573982972683, 19.694596036351523, 20.93426241370464, 18.783059434819116, 17.908778470884812, 18.13507326168101, 9.688861611972634, 12.683739667978378, 19.811252965753777, 16.680963344654774, 17.32351565513007, 9.784371478446857, 12.731365219848131, 13.820720984608187, 15.819456403097723, 21.389811204092165, 22.55598993170901, 22.664057334938985, 15.66184882865933, 17.303808958856678, 18.766844745185956, 21.552398240342587, 17.314713224890195, 19.174459070996722, 18.19485674182333, 21.009019051466623, 17.538363133995187, 23.979747841720915, 18.2705939251152, 15.7698593610198, 19.26022848842482, 14.185727604779101, 30.523003034526244, 24.69712985735353, 21.487629845089756, 22.7059408215605, 20.233255992457508, 11.922905269513365, 15.12963400146016, 11.48912529307605, 23.3038390748926, 20.72823734158085, 21.33631552974422, 22.243185095222387, 22.338760830540412, 25.05354712168199, 26.680810096032502, 10.432700717216584, 10.417656074265825, 
17.510326541179737, 19.72665285174452, 19.53204198841079, 20.810577835739533, 21.923980316176014, 16.434107016131662, 20.95698264197173, 15.763469676891708, 21.70351253598207, 18.73321818109027, 14.753670686770853, 12.750175863081799, 14.536893611755875, 24.377523792173974, 23.22474620543287, 24.964811115892783, 18.327344232868803, 21.818494961138843, 21.268373841453055, 15.91345637502419, 17.89019267238991, 21.091972791778385, 15.558455935731923, 23.411394831333155, 23.97931582221641, 20.36003852135054, 30.692955052075778, 14.96733944465267, 21.44424846210027, 16.87132868904038, 20.21326012716247, 25.495097567963935, 23.441148000683622, 22.82346660567824, 17.416017044457693, 14.849314214248201, 22.445496728405306, 22.423641679586925, 19.677356256993363, 16.357248780374757, 10.798941728458589, 11.64395693009412, 18.061736182439173, 15.790643727358791, 16.215109047272886, 13.602721148237995, 18.94204523721685, 16.58828108757385, 12.276786047163906, 26.19104161613042, 13.728040781985483, 13.822389246390024, 17.48863096197165, 22.77920043066136, 20.341187113879954, 6.953241443415772, 8.772480183906334, 16.8249482352289, 10.902149192706355, 13.299837411209728, 9.859467764575994, 21.0332193169151, 17.17234297302958, 18.488990946182803, 11.6915003664928, 26.269797012276836, 25.668381145017495, 23.75240914955065, 20.623614048234895, 25.952252906329665, 18.763457126268353, 23.698244460650063, 22.26057856647635, 17.63533254338961, 14.722318854069506, 26.700581415337968, 27.02200010977567, 14.694879988318286, 16.16310798867607, 19.984577482116762, 20.78540014647609, 21.018603825245417, 20.334052812160525, 18.43127032041259, 16.82647927092474, 20.768991517517826, 17.075524968768516, 18.55001376919892, 17.466051934410945, 21.626155831166454, 25.659307140709846, 22.297225962953373, 13.729615728767119, 13.990605277680656, 17.719277905455844, 12.08031025366726, 9.608362466184177, 11.79297923587377, 14.592703837258341, 12.729033504462974, 19.30297692273523, 12.193648376899137, 
32.1822225732771, 34.283642751331676, 44.096295320936406, 29.898354267849932, 33.20904193122234, 29.662920661035
|
|||
|
"GLM : [16.6430835283622, 17.32336404962201, 20.769909982821744, 19.299137491444444, 20.269425165431755, 16.72039599056918, 19.709032763791168, 26.530751837265875, 21.705726788712777, 20.321769250288842, 17.25640399509859, 13.841289296747153, 16.810212056425286, 18.800609273114656, 16.795120867813498, 14.77904023532512, 16.769327228171772, 18.523763628704586, 9.8067790505884, 15.823499796057645, 14.441472786438785, 12.52946780933541, 19.142613070237488, 24.525800119323026, 12.438189976893675, 15.814466684883804, 20.855646248738058, 19.207004092921284, 15.685405840403316, 20.92783350661157, 22.724680142908152, 12.219971730591931, 17.1088208322202, 16.5639949909548, 9.654893846056314, 9.773558559639953, 17.295526309663543, 21.222595649084983, 22.8521792097667, 16.728757771990246, 17.00811974275446, 16.186622918102305, 11.827232998382819, 15.79017105131936, 11.831869509781953, 17.75564407693614, 18.77617219079646, 19.285155546560834, 21.680687276529735, 18.37912330632793, 19.730268729055574, 21.470289942246563, 21.46920204490613, 19.635110114401257, 21.904171976261978, 19.554537355466056, 17.747599501400902, 20.27770008559826, 9.640162420817731, 12.760672937935308, 21.473302110811566, 16.5442198897974, 15.956568946480646, 9.810258614868584, 12.752650184589397, 13.756705960921325, 14.938786773415353, 20.866801273913044, 22.6437015930689, 23.325063933394635, 15.655401487683545, 17.708872684612086, 19.577733443164234, 21.61614059227018, 17.825326941893984, 18.728141754497027, 17.915087026929505, 22.817160938764584, 17.102591798671863, 23.40773461913577, 18.55195387069204, 15.81666271146499, 17.154174176218017, 13.558642225341748, 29.550059558753603, 24.606252354962432, 21.507283576213975, 21.338633628027623, 20.076227895893773, 11.845789952459894, 15.740530770600115, 11.48912529307605, 21.40249887626265, 20.375862202763454, 22.93619056061466, 22.33233483250192, 22.28452331460819, 24.675232014182285, 25.39199764679094, 10.432700717216584, 10.649433928954249, 
16.472503480852314, 19.705823687869348, 18.376183328200263, 20.753076881651673, 21.667323684779063, 16.786633980263183, 22.205380099241534, 15.849596785671523, 21.69510043544591, 18.660641479448316, 14.658972088782363, 12.716973698889385, 14.574933524734213, 25.21017602348153, 21.949040579355515, 25.789719425637834, 17.37359733871246, 23.485774693328228, 19.059130657100447, 17.208358745145002, 18.38040780758452, 21.86973522740544, 16.60610304921727, 24.32040573379769, 24.769943475309237, 22.213142483634808, 30.319149174415607, 15.150262307212556, 21.445910953402503, 17.356548744905385, 20.493901531919203, 25.537988000946015, 22.486403795990377, 22.76880830726555, 17.800295404717982, 14.502254852879494, 23.46147835784416, 30.680661071984897, 19.724082274803756, 16.10406866929678, 10.851241884379677, 11.669034474480345, 18.025374415076968, 15.794803886678581, 16.021842011745523, 14.04614317958926, 19.827821788703055, 16.16001514199078, 12.782771159802298, 36.952044519936635, 13.814487973813142, 14.045278774624428, 17.360297542754175, 22.593032163453678, 21.24519519741701, 6.9043158080450695, 8.7742420817813, 16.853404498811052, 10.862912446904126, 13.931003140246336, 9.79148362360977, 20.72251496203841, 17.724068850822743, 17.6948936521326, 11.799601245101115, 26.115992893026093, 26.449703405165074, 24.20077442089979, 25.304004914437535, 26.853584812102575, 18.68249255369359, 23.340425529882033, 22.46653746301892, 16.691616905083684, 14.816331401708167, 24.173320582278418, 27.441956448044678, 14.647923090110504, 15.345164818391531, 19.88783511325927, 20.594135937758217, 21.415791911047226, 20.65616980437443, 19.91548321615108, 16.742289916316068, 20.9058324501519, 18.436932780404376, 20.737246425445477, 17.24460360048404, 21.387937084356995, 25.834063395187364, 23.061614157836683, 13.828144902484995, 13.638563363963314, 17.812362084224066, 12.675521757540945, 10.126292913884267, 11.804077680542967, 18.322440426015127, 12.624829119995987, 19.544923966611073, 
12.781139417682224, 32.91204446163059, 39.07696291054802, 39.87505605479412, 33.31648339498472, 36.07209554041437
|
|||
|
"Baidu : [16.78600515282015, 17.388953861994104, 20.263958700578666, 19.399796043574675, 20.34432537501564, 17.56476407041201, 19.589049321538386, 27.19755345257001, 21.803592275530665, 20.586458840048643, 17.093957224185587, 13.778647775932704, 16.720136060706736, 18.302514395392482, 16.700590170537023, 15.788196586946007, 16.629330387028187, 18.8672131105584, 9.709299607098561, 15.679299560834897, 14.725597706689648, 12.43825117369412, 18.327370687176987, 24.7001207399045, 12.023368886443686, 15.811455487086286, 21.959901671648993, 20.357243598387164, 15.73740532531179, 19.958936861586718, 22.32019225984022, 11.517467221747458, 17.43084729568591, 16.671936528173013, 9.68693572141454, 9.710036915678335, 17.757822390999962, 20.495512415159542, 21.71392851916247, 16.72799825011015, 16.518340870602795, 15.866064903823448, 11.744045058465849, 15.787346818643233, 11.840433998908527, 17.699163061753886, 18.593268364013944, 19.10093726849237, 21.70182985497743, 18.82955623772154, 20.884060041761465, 20.12521212730321, 21.473363896641054, 19.718253228914833, 21.063772487137644, 19.51745151446268, 17.71115821031251, 19.892453240913937, 9.721987038641945, 12.61337564249394, 21.448954408291957, 16.659908371007926, 16.83068756299048, 9.81843287944558, 12.627069951892109, 13.73479908716497, 15.519568534524026, 20.297963442847113, 22.435478404450823, 23.77185444074187, 15.764268342067247, 17.375485920218964, 19.110378596982233, 21.613552373492528, 17.66110271162761, 18.750877211164983, 18.281503469582706, 21.56602540394998, 17.569937623831084, 23.97578413781389, 18.727064153120203, 15.723769773750684, 18.352794549952694, 15.542896711025309, 29.57011538075551, 23.61448491474835, 21.318731233242634, 21.835720237789985, 19.749604453981156, 11.80581275577417, 15.635426208015438, 11.841012481577438, 22.09101058641589, 20.911798768144493, 23.166026857181027, 22.102246125173338, 22.482088919764834, 24.316442347645363, 24.22278695015667, 10.631868179511319, 10.60911966052156, 
16.753147976956953, 19.673640762711603, 19.9807898594954, 20.78946167687961, 21.968492348660984, 16.281778060352746, 20.789676833073912, 15.779327322846136, 21.716829485655065, 18.70874762023645, 14.62950361480577, 12.694297499023577, 14.788439880185285, 25.479138326359, 23.13962250191691, 26.426633183836344, 17.594184731915373, 23.609646842612392, 20.05563231911823, 18.084436354070956, 17.975513594642223, 21.05906761845089, 16.499910154259304, 23.50804637065224, 22.788784447229624, 20.55566242956213, 30.288232050043053, 14.89329265307209, 21.448954408291797, 17.131616481541005, 20.668537192199175, 25.423773527564208, 23.13615875819889, 22.767052452590026, 17.071071070512772, 14.281100163910569, 26.340874815451933, 30.65220756672719, 19.59383460532464, 15.738221485112595, 10.815765748385124, 11.795365810730047, 17.76666030335957, 15.74391737012346, 16.20575239629905, 13.743452445808458, 19.098983937643645, 16.171049866759798, 12.408506455803238, 27.592974744357996, 13.726046108747049, 14.024228822805572, 17.71625608065323, 22.958422349453087, 20.528510701887264, 6.9628899326291736, 8.757518329279005, 16.823232584904854, 10.75628377172455, 13.788427647650918, 9.845456308146067, 21.182451702317834, 17.741045192365117, 18.10414210521813, 11.696380337701086, 25.281610662077583, 26.15239044113751, 23.5827416539145, 24.19937667065778, 26.198901367991386, 18.72481054363292, 23.256216743535823, 22.001369389947797, 16.884729757014746, 14.86032983900312, 25.984085885771126, 26.228867434005284, 14.616892410322329, 16.36604389018134, 20.353456237244675, 20.643782786066012, 21.307316303438242, 18.923667437748303, 18.839631507568356, 16.80008842108939, 20.95608567027042, 17.672547594456564, 19.660548946966134, 17.490670355806202, 21.400169636195848, 26.12510819831057, 21.74990392654459, 13.758736141739188, 14.332945424439407, 17.56442052447247, 12.730683597329739, 9.960486609751998, 11.83096222324923, 17.67947546824424, 12.688473608833355, 19.609302743483088, 12.824497378834604, 
33.61691572893698, 33.00555322646526, 36.69880407912848, 32.54626495707527, 34.935976713322205, 31.
|
|||
|
"Qwen : [16.49242250247065, 16.686649181199865, 19.910925559898555, 19.5383349733967, 20.279349481847728, 16.992421034786176, 19.70378182070244, 27.981965103119485, 21.767998948947, 20.2984127173934, 17.320175649296793, 13.871604459597808, 16.70653380732574, 18.755967397663742, 16.77791016964212, 16.151209901324066, 16.856408091748314, 18.917210501413205, 9.748888167326227, 15.718535623285264, 14.355679480528892, 11.818902475157167, 18.167358396344063, 25.12099574223989, 12.349898617740575, 15.847236919590722, 20.83562783484754, 20.308807358024016, 15.809373077382462, 19.663582809455196, 22.996836940378767, 11.943138710618154, 17.52949504624801, 16.39274027992318, 9.619266947682576, 9.758959788362358, 17.20497445000673, 22.363847355401703, 22.576880096973415, 16.66877038196146, 16.75516850717909, 16.09626102992447, 11.851760868921113, 15.787775290402184, 11.894285029976214, 17.719950134458838, 18.765463472133984, 19.223379130273194, 21.703512535982203, 18.23508856544311, 21.892069098043763, 22.044397835771296, 21.57925115158319, 19.716136675268093, 20.71422073841603, 19.024111973746766, 17.777364206407945, 20.47268562965599, 9.678929486723666, 12.746072596899744, 21.539195012857533, 16.659908371007933, 17.112863094099886, 9.685768701331515, 12.806352665142605, 13.817339255513334, 15.990759917852293, 21.446406911912916, 22.125905769300036, 23.356195154786448, 17.580045977431553, 17.8993335157474, 19.086555426989882, 21.77092686864209, 17.913392859566443, 19.318370024725073, 17.76793154993539, 21.231557533963343, 17.44530785649763, 24.16534108424743, 18.43022558405518, 15.767643977267447, 17.653814584243126, 15.209690027257714, 29.413946913902155, 25.92999284216253, 21.585410884683334, 21.275284951689468, 20.46768832352362, 11.770198525823654, 15.72898197139594, 11.60505449529102, 22.31211129797026, 19.848509476769557, 23.008947915847635, 20.656397392885424, 21.662117548105403, 23.282670521373202, 25.80521035540313, 10.432700717216584, 10.717755802949265, 
16.618500604522946, 19.65494953891497, 20.440309304570036, 20.726532284964573, 21.775650060921404, 17.303371565789472, 22.29546199964429, 15.867126847247954, 21.758485199707504, 18.71871608779327, 14.674303806584257, 12.8063526651426, 14.754555842110022, 25.753596949062693, 23.066721343607547, 26.53539690883856, 16.89111907297431, 24.539369488196556, 20.18446442830011, 18.096808234090712, 18.488298102742284, 21.48716989104014, 16.64786596728707, 23.532999641369948, 24.029458040334486, 22.13809603590682, 30.110364902199322, 14.92204669546916, 21.004825943728047, 16.524676356690005, 20.6281287367164, 25.395299811998573, 22.16682645861641, 23.000000000000018, 16.553678181264814, 14.655784519293372, 26.751056308740644, 30.49472266812742, 19.663423357589668, 15.837240760492847, 10.857959766999015, 11.699292050858404, 18.66745609270129, 15.810323708656714, 15.710158534963808, 14.823634781480862, 18.085638320336606, 16.458333795180394, 12.244332200772382, 34.74595345009185, 13.752443556234839, 13.190832846841799, 17.236436342545417, 22.36287808671842, 22.017160324449197, 6.964901719114079, 8.844566293473695, 16.840826297428382, 10.707443858091125, 13.8393491744441, 9.835796556046425, 21.1754812828076, 17.785941981726413, 16.883331962117833, 11.793413201223768, 25.193860766027814, 26.332184587404498, 23.51756132987782, 25.399132460629872, 26.88195839734064, 18.740614656656902, 22.8580196044982, 21.203180905300812, 16.82598436073685, 14.848085360467028, 25.17075859770986, 26.99574913719904, 14.644321956667975, 16.066701523621145, 20.00738165565782, 20.597927952011077, 19.566843004070375, 19.947967369925458, 19.59379966945489, 16.710028689079113, 21.124109527502075, 17.502730377462957, 19.58309392676658, 17.30730789014221, 21.156635639515166, 26.05944354759675, 22.329878281539983, 13.428396348859247, 14.38487335443563, 17.752003032901264, 12.781504141961284, 10.151718491555778, 11.830351905999217, 17.94203939294091, 12.793545883735742, 19.686499045755685, 12.825413468725145, 
33.739362162356045, 35.829487797357736, 34.905839129874465, 31.870804784084292, 34.56705484714001, 31
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"perp_rag = cal_perplexity_all(answers,answer_rag) \n",
|
|||
|
"perp_glm = cal_perplexity_all(answers,answer_glm)\n",
|
|||
|
"perp_baidu = cal_perplexity_all(answers,answer_baidu)\n",
|
|||
|
"perp_qwen72 = cal_perplexity_all(answers,answer_qwen72)\n",
|
|||
|
"\n",
|
|||
|
"print(\"RAG :\",perp_rag)\n",
|
|||
|
"print(\"GLM :\",perp_glm)\n",
|
|||
|
"print(\"Baidu :\",perp_baidu)\n",
|
|||
|
"print(\"Qwen :\",perp_qwen72)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 23,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"/home/zhangxj/miniconda3/envs/Qwen/lib/python3.10/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n",
|
|||
|
" return self.fget.__get__(instance, owner)()\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"## 计算BERTscore\n",
|
|||
|
"from transformers import BertTokenizer, BertModel\n",
|
|||
|
"tokenizer = BertTokenizer.from_pretrained(\"/home/zhangxj/models/bert/bert-base-chinese\")\n",
|
|||
|
"model2 = BertModel.from_pretrained(\"/home/zhangxj/models/bert/bert-base-chinese\")\n",
|
|||
|
"\n",
|
|||
|
"def cosine_similarity(a, b):\n",
|
|||
|
" return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))\n",
|
|||
|
"\n",
|
|||
|
"def bert_score(reference, candidate, return_similarity_matrix=True):\n",
|
|||
|
" # 计算余弦相似度\n",
|
|||
|
" cosine_similarities = np.zeros((reference.shape[0], candidate.shape[0]))\n",
|
|||
|
" for i, c in enumerate(candidate):\n",
|
|||
|
" for j, r in enumerate(reference):\n",
|
|||
|
" cosine_similarities[i, j] = cosine_similarity(c, r)\n",
|
|||
|
" # 取每一行数据的最大余弦相似度\n",
|
|||
|
" max_similarities = cosine_similarities.max(axis=1)\n",
|
|||
|
" # 取所有余弦相似度的均值\n",
|
|||
|
" bertscore = max_similarities.mean()\n",
|
|||
|
" if return_similarity_matrix:\n",
|
|||
|
" return bertscore, cosine_similarities\n",
|
|||
|
" else:\n",
|
|||
|
" return bertscore"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 24,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"def Bert_score_all(target,pred):\n",
|
|||
|
" '''文本嵌入列表'''\n",
|
|||
|
" scores = []\n",
|
|||
|
" for tar,pre in zip(target,pred):\n",
|
|||
|
" score = bert_score([tar],[pre])\n",
|
|||
|
" scores.append(score)\n",
|
|||
|
" return np.mean(scores)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 25,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"bertscore_rag = bert_score(emb_ans,emb_rag)\n",
|
|||
|
"bertscore_glm = bert_score(emb_ans,emb_glm)\n",
|
|||
|
"bertscore_baidu = bert_score(emb_ans,emb_baidu)\n",
|
|||
|
"bertscore_qwen = bert_score(emb_ans,emb_qwen72)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 26,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"RAG : 0.8358214576902688\n",
|
|||
|
"GLM : 0.7974971954358191\n",
|
|||
|
"Baidu : 0.7998237824382414\n",
|
|||
|
"Qwen : 0.7911021765364117\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(\"RAG :\",bertscore_rag[0])\n",
|
|||
|
"print(\"GLM :\",bertscore_glm[0])\n",
|
|||
|
"print(\"Baidu :\",bertscore_baidu[0])\n",
|
|||
|
"print(\"Qwen :\",bertscore_qwen[0])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 1,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Building prefix dict from the default dictionary ...\n",
|
|||
|
"Loading model from cache /tmp/jieba.cache\n",
|
|||
|
"Loading model cost 1.688 seconds.\n",
|
|||
|
"Prefix dict has been built successfully.\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"['生命', '生命周期', '周期', '分析', '目标', '研究', '范围', '医疗', '废物', '数据', '清单', '收集', '信息', '生命', '生命周期', '周期', '影响', '评价', '目标', '分析', '帮助', '制定', '环保', '策略', '利用', '提供', '建议', '纳米', '米粒', '粒子', '影响', '溴化锂', '溶液', '稳定', '稳定性', '定性', '沸腾', '温度', '中央', '中央空调', '空调', '系统', '排放', '何计', '计算', '生命', '生命周期', '周期', '期中', '种类', '类型', '空调', '系统', '运行', '排放', '比较', '空调', '系统', '单位', '单位制', '制冷', '制冷量', '每年', '年产', '产生', '排放', '排放量', '放量', '采用', '软件', '评估', '空调', '系统', '环境', '环境影响', '影响', '源热泵', '热泵', '系统', '生命', '生命周期', '周期', '评价', '优势', '变频', '变频空调', '空调', '调控', '控制', '控制系统', '系统', '混杂', '自动', '自动机', '动机', '控制', '控制策略', '策略', '特性', '变频', '变频空调', '空调', '控制', '控制策略', '策略', '相比', '优势', '压缩', '压缩机', '运转', '频率', '受到', '温度', '参数', '影响', '系统', '优点', '方法', '用于', '分析', '系统', '研究', '发现', '资源', '消耗', '系数', '系统', '火力', '火力发电', '发电', '相比', '一类', '污染', '污染物', '物排放', '排放', '放影', '影响', '较大', '铁路', '隧道', '生命', '生命周期', '周期', '划分', '分包', '包括', '理论', '用于', '建立', '绿色', '等级', '综合', '评价', '模型', '绿色', '铁路', '隧道', '定义', '绿色', '铁路', '隧道', '评价', '指标', '指标体系', '体系', '建立', '滚落', '灾害', '危险', '危险性', '评价', '缺失', '稳定', '稳定性', '定性', '评估', '稳定', '稳定性', '定性', '评估', '中用', '用于', '危险', '危险性', '等级', '划分', '工具', '稳定', '稳定性', '定性', '评估', '危险', '危险性', '等级', '等级分', '分为', '几个', '级别', '何需', '滚落', '灾害', '危险', '危险性', '评价', '崩塌', '要区', '区别', '滚落', '研究', '现有', '局限', '混凝土', '排放', '强度', '关系', '混凝土', '排放', '混凝土', '土生', '生命', '生命周期', '周期', '排放', '不确定性', '确定性', '定性', '范围', '混凝土', '土生', '生命', '生命周期', '周期', '排放', '过程', '排放', '不确定性', '确定性', '定性', '最大', '建筑', '建筑业', '国家', '排放', '比例', '混凝土', '土生', '生命', '生命周期', '周期', '评估', '包括', '哪几', '哪几个', '几个', '步骤', '研究', '发现', '添加', '粉煤', '粉煤灰', '煤灰', '影响', '混凝土', '排放', '混凝土', '排放', '分析', '国外', '生命', '生命周期', '周期', '评价', '汽车', '回收', '利用', '要求', '高效', '高效能', '效能', '塑料', '料车', '车门', '门框', '框架', '传统', '车门', '门框', '框架', '相比', '优势', '国内', '内家', '家电', '家电产品', '产品', '回收', '利用', '现状', '国内', '国内外', '内外', '电动', '电动汽车', '汽车', '生命', '生命周期', '周期', '评价', '深度', '上有', '差异', '国外', '饮料', '生命', '生命周期', '周期', '影响', '评价', 
'国外', '推动', '生命', '生命周期', '周期', '评价', '方法', '标准', '标准化', '国内', '国内外', '内外', '生命', '生命周期', '周期', '影响', '评价', '敏感', '敏感性', '感性', '分析', '差距', '国内', '国内外', '内外', '考虑', '时空', '因素', '生命', '生命周期', '周期', '影响', '评价', '差距', '国外', '生命', '生命周期', '周期', '影响', '评价', '数据', '数据库', '据库', '建设', '上有', '进展', '生命', '生命周期', '周期', '评价', '黄磷', '产品', '足迹', '量化', '黄磷', '产品', '足迹', '核算', '哪些项目', '项目', '目<><E79BAE>
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgMAAAFeCAYAAAAYIxzjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/TGe4hAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOy9dZxcV3rm/z2XiqGZuaVuMbMly8xsD3kwk2SSSTYbnMD+kt1ssruhDe0mk2STTAY9Yw95PGNG2ZZkMTM2MxTXpfP7o0ot6FarJbVsJ9PP59OG7qpb996655znvO/zPq+QUkpmMIMZzGAGM5jBTy2UD/oEZjCDGcxgBjOYwQeLGTIwgxnMYAYzmMFPOWbIwAxmMIMZzGAGP+WYIQMzmMEMZjCDGfyUY4YMzGAGM5jBDGbwU44ZMjCDGcxgBjOYwU85ZsjADGYwgxnMYAY/5ZghAzOYwQxmMIMZ/JRjhgzM4JogpYnjDiOlM83HdZHSREp3Wo87gxnMYAYzuDxmyMAMrgm208HIyO9hWnuRUuZ/XKR08v+emrGl6w7juqmx10uZJJl6Jn/cGUIwgxnMYAbvB7QP+gT+vSG3aGXJ3ToVIcQHfEbvP6R0yWTewLQO4Tr9WBzEto+TNXdiO+34vffj9z8EGFc8Vir1LJZ9jHD4t1BFAUL4MK29OE4fhr7oxl/MDGYwgylDShfH7UGgoyhFCDGzn/yPghkycJWQcoRY7C9wnE58vofw+e5FCM8N+iyJdIeRmChKCUKoN+Rzru6cXExzG8nUU6hqOan0c0iZRCghdG0WXs8GDGMRoE/hWBlM6yCuO4ogR6qE0FCVMlS1bGaiuQzSaZNDhzqZO7cKr1f/qSSkHySktDHNXdj2GTS9GVUpRVEKEML/U/Bd2CQS/0w2uxW//zEC/sdRlMgHfVIzmAbMkIGrhBARfL57icX+hETi7zGMRWha4zUfLxdpcJEyi5QppEzhukM4dju2cwYzuw1XxvD7HsPnf+gDHXi5SXA38eRXCAd/GY9nHSAQQkPKLKDmJ0UFuPKkaFnHsOwThEO/jBDnrys3qfpw3RiuGwNsFKUUIXw/BZPt5SGlJJHI8uZbh/ne93bwC1+4lZUrGwCu+r5IKbFtl1Qqy8BAgmDQQ3FxCFWdIWBXhoIrE4zG/wLXjaFp1WhqHbo+J0+GFyPElaNiHzRyqb00Uibz88pUiKWOEH5M6wBW7Bi2fZZI5HdRhO/9OOUZ3ECIma6FVw8pXWz7OGZ2M17ffShKMWAipYlAB+G94qBKp1/Eto+jiPAYAbDs4wjhQVNrEEoBivDjyjjJxFdRlAAFhX+Prs9/fy7yEkiZJZPdTDa7CY9nPYaxGNcZwLIOY9nHsZ0OdG02oeDPIIR3CsezicX/EinThEO/jhAGrpvEdQeIJ/4R142hKFFs5wyKCBMK/TKGvuh9iRZIKXGkRBXiqhdZ15UIcfWL85WOmUxmOH16gJ07TzMwkKClpYJFi2o4dbqf5cvqCYUmnoyllGQyFolEBtt2icXS9PbFSCazpNMm8ViGQ4c7cV3J53/mZlpbK6btvK8GUrpIJALl3wXhkzJNIvlVYvG/xXWH87/VUdUKIuHfwO97/EMf2ZJSkkp/j2TqaTzGavy+B9C0xkkjkFLajIz+AYnkvwESTW2kpPjbaFrN+3beM7gxmIkMXAHn1fICkPkfB1UtwfCsxLIO4DhdOPYZbKcTTa3DH/g4mlY36XGzmTfIZF7BH/gEmtaEbiwhO/ouihImEPnDsbSAlBaWuRvXHUVRSm7w1V4ettOJ644SDHwWVS0jnX6JrLkFKbMoSgFez3o8xnJARUp5hQldYtnHyWa3Eg7/FkJ4yWa3kDHfxbE7yWReRyhBQsGfJeB/AlUtzxOu92eRGE6lefPIKeZUlNJaUTLlxUlKyabjpynw+5hfVYaqXN9ikNu5QX9/jF27zpI1beobSnjwwaUUFPjZufMMzzyzjZ
LiEPPnV1/2OIcPd/GvX3mbFcvr8fs9bNlygoaGEh56aCker05LSwX/8I+vY9nXVxlyfl8hyUWMpv59xe0+OpI7KPbOosQz+98BIfASCHwGgZdY4u9wnE7AwnHaSCS+gtdzO6paeNVHPRcpzB2rD9vpRBF+dL0ln46c3vuiKiWY5l6y2a1ks28TCf8uhrHisvffdQexrMPkvmPQ9GYUpWhaz2kGHwxmyMAVkMk8j22fQVWrcuH7/MLvuoNIXJASITQUpRBVa0DTZyGE/4rHVdRiVK2aQODTqGopUsr8oLIQwjvGzoXQUdQyJG4u6vABQVOr0Xz1+d2OxOu7E693Y36C0nDcfrLZrdj20wT8H5t0pyBlhmTymyhqCY7TA6jo+gJ0vYVU+lmy5hYMfR6KiGAYy26YViKZNclYFgUBP8oFk59A8MaRUxzvHaS1IkfAspbNSDpDcdA/6SK/p62bA529/OVH7yPsG68lcV3JYDJJwGPgNyYPJUsJp0/30dcXZ+3aWdi2w7/861soiqCpsZRNbx/lscdW0NxcNulxVFVB0xQ+8pFVCCEYGEwgpaS2tij/OZKCggCh4JUjOhOfp8SSGVL2IEPZUzjSodq/BK8amfKirgsffZmjHIu9yp2Vf4Bfu/qF9P2EEAKBj0Dgk+j6HOKJfyST3YSUSRx3EFeOojK1a5DSylfVDGI7vVjWYUxzO5Z9DNcdRggvAf/jhIK/hBChaSNKQgg0rR5FKcBxYmTN90ikvkWhsYTLaX5Mcz+mdTj/f1peM3Dl+W4GH37MkIErwLKOkkp+FZ/vMVS1FE1rwmOszoXxlRCKEkaIyAX57KkNVCEMBOrY64UQCCWAdBPY9lmE6EFVq1GUIIoI4dJ34y5yStCQMo3rZvNpjRFspw3LPo7j9CDdGFlzF0J48HnvBiYmA1LapNI/yb3Oc0ueDLgoShTHOUsmuwmPsQaPZw2mtQfVrMHrWcONiAp0j8T561ff5b6FLdwxdxZaPl8e9BpUF0bw6trYxHuqf4g/f3ETjy9fwD0LJt65CiGoK4rSPjSKqkx8vinL4mubdzOUTPOlezYQ8V1+ARYCXBe+9a0t/NzPbaSlpZw7bp+Ppikkk1mefHItZaXhSRcHIQSKqmAYGlLCyEhyTCfwgx/sYOXKJgxDw+ebnJj0Z45zPPYq9cG1lHnn0Jney8GRZyn0NKILL0IoqEInZvVwZPRFGoPrWVf6i3jU0LhjSSlxsVFQx0LpXjXMquLPcya5GVtmSdqDZJ04KXsInxahwKhH+RAIaC+FEBoezyp0fQ7Z7FbSmZdR1CJU5TxBk9LGcfuRbiKvDUriOL3YTju2fRrbacN1BnIkwh0C7Es+RSGR/A66Nhef7wGmayxIKXGcHly3P/8bDVWtBCa+z1JmSKaeQcpY/jcuprkLr+fWPCH4sEdzbhxc183N4R/6iNblMUMGrggXVa0kGPrFC0LV5xdwyKUSUqmnAInP9yCKEr7yYaULqCBNstltuZ2AuR/XHSA2+oeoahWB4OfzYiQ/klxO9YODRTrzApnM60iZAuFF15rQtRZ8nltQ1WpGY3+c3+HPmfAIuUqEPVjWYULBnyOT3XyB2DBLIvk1PMZahNBR1RJU9U7iiX9AVYrQtOkPHdcURmgpL+aF/cdYUluJrqpE/V5UoeBRVbR8BGAwkeKpbXvRNZWySHDCY7lSksiYJLMmvbE4T2/fz6yyIlY31qCpKq4r6RqN8ezuQ2w6dppldVW47uTfpxCCpqZSfvEXb6WkJERHxzBbtpzg/vsXU11dgGk6xGIZPB4Nj0e77P2RrouqKnR3j/D220c5criLktIw5RVRvN7cDlDTlEm9IcJ6ObY02TH4DW4t/y18apQRs52W8N1U+RcDoAiVUbOLs4mtlPvmoSkT6xgcaXEy/iYhvQyBwHRTuR8nQdqNcWT0RVShIxDErB560gdpidzF3Mh9GErgQznhKkoIr/cOvN5b8785v6BKaZFI/C
up9LO4bgwpM4BDLtTuwti4FihKCYoSQVVLUdVqNLUGTatFU2vz42o6r90lk3kzN54BXWvG73vgslqHXDrhbUCiqtX
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import jieba\n",
|
|||
|
"import collections\n",
|
|||
|
"import re\n",
|
|||
|
"from wordcloud import WordCloud\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签\n",
|
|||
|
"plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号\n",
|
|||
|
"\n",
|
|||
|
"answer = []\n",
|
|||
|
"question = []\n",
|
|||
|
"with open(\"/home/zhangxj/WorkFile/LCA-GPT/QA/answer.txt\",\"r\",encoding=\"utf-8\") as f:\n",
|
|||
|
" text = f.readlines()\n",
|
|||
|
" for line in text:\n",
|
|||
|
" str = line.strip()\n",
|
|||
|
" answer.append(str)\n",
|
|||
|
"\n",
|
|||
|
"with open(\"/home/zhangxj/WorkFile/LCA-GPT/QA/filters/question.txt\",\"r\",encoding=\"utf-8\") as f:\n",
|
|||
|
" text = f.readlines()\n",
|
|||
|
" for line in text:\n",
|
|||
|
" str = line.strip()\n",
|
|||
|
" question.append(str)\n",
|
|||
|
"\n",
|
|||
|
"# df = pd.DataFrame({\"question\":questioin,\"answer\":answer})\n",
|
|||
|
"# print(df.head())\n",
|
|||
|
"# df.to_csv(\"/home/zhangxj/WorkFile/LCA-GPT/QA/QA.csv\",index=False,encoding=\"utf-8\")\n",
|
|||
|
"\n",
|
|||
|
"# 绘制词云\n",
|
|||
|
"data = ''.join(question)\n",
|
|||
|
"\n",
|
|||
|
"# 文本预处理 去除一些无用的字符 只提取出中文出来\n",
|
|||
|
"new_data = re.findall('[\\u4e00-\\u9fa5]+', data, re.S)\n",
|
|||
|
"new_data = \" \".join(new_data)\n",
|
|||
|
"\n",
|
|||
|
"# 文本分词\n",
|
|||
|
"seg_list_exact = jieba.cut(new_data, cut_all=True)\n",
|
|||
|
"\n",
|
|||
|
"result_list = []\n",
|
|||
|
"with open('/home/zhangxj/WorkFile/LCA-GPT/QA/hit_stopwords.txt', encoding='utf-8') as f:\n",
|
|||
|
" con = f.readlines()\n",
|
|||
|
" stop_words = set()\n",
|
|||
|
" for i in con:\n",
|
|||
|
" i = i.replace(\"\\n\", \"\") # 去掉读取每一行数据的\\n\n",
|
|||
|
" stop_words.add(i)\n",
|
|||
|
"\n",
|
|||
|
"for word in seg_list_exact:\n",
|
|||
|
" # 设置停用词并去除单个词\n",
|
|||
|
" if word not in stop_words and len(word) > 1:\n",
|
|||
|
" result_list.append(word)\n",
|
|||
|
"print(result_list)\n",
|
|||
|
"\n",
|
|||
|
"# 筛选后统计\n",
|
|||
|
"word_counts = collections.Counter(result_list)\n",
|
|||
|
"# 获取前100最高频的词\n",
|
|||
|
"word_counts_top100 = word_counts.most_common(100)\n",
|
|||
|
"# print(word_counts_top100)\n",
|
|||
|
"\n",
|
|||
|
"# 绘制词云\n",
|
|||
|
"my_cloud = WordCloud(\n",
|
|||
|
" background_color='white', # 设置背景颜色 默认是black\n",
|
|||
|
" width=900, height=600,\n",
|
|||
|
" max_words=100, # 词云显示的最大词语数量\n",
|
|||
|
" font_path='/home/zhangxj/.fonts/楷体_GB2312.TTF',\n",
|
|||
|
" max_font_size=99, # 设置字体最大值\n",
|
|||
|
" min_font_size=16, # 设置子图最小值\n",
|
|||
|
" random_state=35 # 设置随机生成状态,即多少种配色方案\n",
|
|||
|
").generate_from_frequencies(word_counts)\n",
|
|||
|
"\n",
|
|||
|
"# 显示生成的词云图片\n",
|
|||
|
"plt.imshow(my_cloud, interpolation='bilinear')\n",
|
|||
|
"# 显示设置词云图中无坐标轴\n",
|
|||
|
"plt.axis('off')\n",
|
|||
|
"plt.savefig(\"wordcloud.png\",format='png')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Qwen",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.10.14"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|