import os from langchain_core.messages import HumanMessage from langchain_openai import ChatOpenAI from langchain.embeddings import HuggingFaceEmbeddings from langchain.vectorstores import Milvus import json
def setup_llm_environment():
    """Point the OpenAI-compatible client at the local inference server.

    Exports the OPENAI_* environment variables that every ChatOpenAI
    instance in this module reads, then returns a chat model handle.
    """
    endpoint_config = {
        # The local server does not validate keys; a placeholder suffices.
        "OPENAI_API_KEY": "None",
        "OPENAI_API_BASE": "http://10.58.0.2:8000/v1",
    }
    os.environ.update(endpoint_config)
    return ChatOpenAI(model_name="Qwen2.5-14B")
def init_embedding_model():
    """Load the local sentence-transformer used to embed queries and abstracts."""
    local_model_path = "E:/all-MiniLM-L12-v2"
    return HuggingFaceEmbeddings(model_name=local_model_path)
def init_vector_database(embedding):
    """Connect to the remote Milvus collection holding the arXiv abstracts.

    Args:
        embedding: embedding function used to vectorize search queries.
    """
    milvus_connection = {"host": "10.58.0.2", "port": "19530"}
    return Milvus(
        embedding_function=embedding,
        collection_name="arXiv_Back",
        connection_args=milvus_connection,
    )
def retrieve_relevant_abstracts(db, question: str, top_k: int = 10):
    """Run a top-k similarity search against the vector store.

    The original retry loop re-ran the search when it returned an empty
    list — pointless for a deterministic query — while any transient
    failure (e.g. a dropped Milvus connection) aborted on the first
    attempt. Retries now target exceptions instead; the last failure is
    re-raised so errors stay visible to the caller.

    Args:
        db: vector store exposing ``similarity_search(query, k=...)``.
        question: search query text.
        top_k: maximum number of documents to return.

    Returns:
        The (possibly empty) list of matching documents.

    Raises:
        Exception: whatever ``similarity_search`` raised, if all attempts fail.
    """
    attempts = 3
    for attempt in range(attempts):
        try:
            return db.similarity_search(question, k=top_k)
        except Exception:
            # Only transient backend errors are worth retrying; give up
            # loudly once the attempt budget is exhausted.
            if attempt == attempts - 1:
                raise
    return []
def generate_response(question: str, abstracts: list):
    """Generate a cited answer to *question* from retrieved abstracts.

    Each document in *abstracts* must carry ``access_id``, ``authors`` and
    ``title`` in its metadata (a missing key raises KeyError). The prompt
    template below is user-facing runtime text (Chinese) and must not be
    altered; it instructs the model to answer with numbered citations and
    arxiv.org links, and to fall back to its own knowledge when the
    retrieved papers are irrelevant.

    Args:
        question: the user's original question.
        abstracts: retrieved documents with metadata and page_content.

    Returns:
        The generated answer text (first generation of the first prompt).
    """
    # Flatten each document into a single metadata+abstract line for the prompt.
    abstracts_info = [
        f"access_id:{a.metadata['access_id']},authors:{a.metadata['authors']},abstract:{a.page_content},title:{a.metadata['title']}"
        for a in abstracts
    ]
    # Runtime prompt (Chinese): answer the question using the abstracts,
    # cite sources as [n] with detail/pdf links, and includes one worked example.
    prompt_template = """ 用户提问: {question} 以下是与问题最相关的论文摘要和信息: {abstracts} 请结合上面的摘要回答用户的问题,并引用相关的文献,格式示例如下: "(对问题中的概念做出解释), ***认为(文献的观点)[1], ***认为(文献的观点)[2]。 (***代表作者或作者们的名字,或摘要中提及的人名,请务必替换为具体的人名,在回答中不要出现***) [1](title),(authors),论文的详情页:https://arxiv.org/abs/(access_id) 论文的pdf地址:https://arxiv.org/pdf/(access_id) [2]... 若文献与问题无关,按自身知识理解回答,无需提及参考文献信息。 下面是一个例子: “大语言模型(Large Language Models,LLMs)是指一类在大量文本数据上训练的深度学习模型,它们能够生成与训练数据相类似的文本,并且能够完成诸如语言翻译、文本生成、问答等多种任务。这些模型因其复杂性和潜在的风险,例如生成有害或误导性的内容,成为了当前研究中的一个重要课题。 Paul Rottger等认为大语言模型的安全性,包括防止生成偏见和有害内容,是当前研究中的一个重要方向[1]。Zishan Guo等认为大语言模型的评估应该包括知识和能力评估、对齐评估和安全评估三个方面,以确保模型的使用是安全和有益的[2]。 依据的参考文献如下: 依据的参考文献如下: [1] SafetyPrompts: a Systematic Review of Open Datasets for Evaluating and Improving Large Language Model Safety, Paul Rottger, Fabio Pernisi, Bertie Vidgen, Dirk Hovy,论文的详情页:https://arxiv.org/abs/2404.05399 论文的pdf地址:https://arxiv.org/pdf/2404.05399 [2] Evaluating Large Language Models: A Comprehensive Survey, Zishan Guo, Renren Jin, Chuang Liu, Yufei Huang, Dan Shi, Supryadi, Linhao Yu, Yan Liu, Jiaxuan Li, Bojian Xiong, Deyi Xiong,论文的详情页:https://arxiv.org/abs/2310.19736 论文的pdf地址:https://arxiv.org/pdf/2310.19736” """
    prompt = prompt_template.format(question=question, abstracts="\n".join(abstracts_info))
    # Fresh client per call; relies on the OPENAI_* env vars exported by
    # setup_llm_environment() — presumably called earlier (see process_questions).
    llm_chat = ChatOpenAI(model_name="Qwen2.5-14B")
    response = llm_chat.generate([prompt])
    return response.generations[0][0].text
def optimize_question_statement(question):
    """Rewrite the user's question into an English, keyword-expanded search query.

    The rewrite prompt (runtime Chinese text, reproduced verbatim) asks the
    model to split the question into keywords, answer in English, and expand
    with 1-2 common related concepts ordered by usage frequency.
    """
    rewrite_prompt = (
        "用户提问{question} 请将其进行关键词拆分并且优化问题语句并以英文回答,例如:什么是软件工程?,回答 what is software engineering safe? keywords include software engineering,safe. 可适度扩展1-2个常用概念关键词,按概念常用频率从高到低排序"
    )
    chat = ChatOpenAI(model_name="Qwen2.5-14B")
    completion = chat.generate([rewrite_prompt.format(question=question)])
    return completion.generations[0][0].text
def handle_single_question(question: str, db):
    """Answer one question end-to-end: rewrite -> retrieve -> generate.

    Args:
        question: the user's original question.
        db: vector store passed through to retrieval.

    Returns:
        The generated answer text, or a fallback message (Chinese:
        "no relevant literature found") when retrieval returns nothing.
    """
    search_query = optimize_question_statement(question)
    abstracts = retrieve_relevant_abstracts(db, search_query)
    if abstracts:
        result = generate_response(question, abstracts)
    else:
        result = "没有搜索到相关文献,请重新提问"
    print("问题答案:", result)
    # Fixed: the original also printed "问题答案已成功写入" ("answer written
    # successfully") here, but nothing is written in this function — that
    # message belongs to process_questions(), which persists the answers.
    return result
def process_questions(questions_file, output_file):
    """Answer every question in *questions_file* and write results to *output_file*.

    Loads a JSON array of objects (each expected to carry a "question" key),
    answers each via handle_single_question, stores the answer under
    "answer" in-place, and dumps the augmented array as pretty-printed JSON.
    Load/parse failures and write failures are reported and swallowed.

    Args:
        questions_file: path to the input JSON file.
        output_file: path the answered JSON is written to.
    """
    try:
        with open(questions_file, 'r', encoding='utf-8') as f:
            questions = json.load(f)
    except FileNotFoundError:
        print(f"问题文件 {questions_file} 未找到。")
        return
    except json.JSONDecodeError:
        print(f"问题文件 {questions_file} 格式错误,无法解析。")
        return

    # Fixed: the return value was bound to an unused variable. The call is
    # kept purely for its side effect of exporting the OPENAI_* environment
    # variables that every ChatOpenAI instance in this module depends on.
    setup_llm_environment()
    embedding_model = init_embedding_model()
    vector_db = init_vector_database(embedding_model)

    for item in questions:
        question = item.get('question', '')
        if not question:
            print(f"跳过没有问题内容的项: {item}")
            continue
        print(f"回答问题: {question}")
        item['answer'] = handle_single_question(question, vector_db)

    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(questions, f, ensure_ascii=False, indent=4)
        print(f"问题答案已成功写入 {output_file}.")
    except Exception as e:
        # Broad on purpose: this is the top-level persistence boundary and
        # json.dump can raise TypeError as well as OSError.
        print(f"写入文件 {output_file} 时发生错误: {e}")
if __name__ == "__main__":
    # NOTE(review): the input file has a .jsonl extension but is parsed with
    # json.load (a single JSON document) — confirm it actually contains one
    # JSON array rather than one object per line.
    questions_file = './questions.jsonl'
    answers_file = 'answer.json'
    process_questions(questions_file, answers_file)