When calling a large model through the ollama framework, you can load it and use it directly without any tokenizer handling, because ollama tokenizes the input automatically. When loading the original model with the transformers package, however, you must tokenize the prompt yourself; otherwise you get the error: TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not ChatPromptValue.
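To make the failure mode concrete, here is a minimal sketch (not from the original post; the prompt_value variable, the sample question, and the 32-token limit are purely illustrative) contrasting the call that raises the TypeError with the tokenized call that works:

# Minimal sketch of the error: piping a ChatPromptValue straight into the model
# hands a non-Tensor object to the embedding layer, which raises the TypeError above.
# Tokenizing the plain text first produces the input_ids Tensor that generate() expects.
from langchain_core.prompts import ChatPromptTemplate
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_PATH = "D:\\ModelScopeModels\\QWEN\\QWEN-2.5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, trust_remote_code=True)

prompt_value = ChatPromptTemplate.from_template("问题: {question}").invoke({"question": "你好"})

# model(prompt_value)  # TypeError: embedding(): argument 'indices' (position 2) must be Tensor
inputs = tokenizer(prompt_value.to_string(), return_tensors="pt")   # tokenize the plain text
output_ids = model.generate(**inputs, max_new_tokens=32)            # generate() now receives Tensors
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))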
When building the chain, the qwen_generate function must be wrapped in a Runnable (here, RunnableLambda) so that it can be piped together with the prompt template and output parser and run normally.
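A stripped-down sketch of that wrapping, with a hypothetical fake_generate stub standing in for qwen_generate (the stub just echoes the prompt text, so no model is needed to run it):

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda

def fake_generate(chat_value) -> str:
    # stub in place of qwen_generate: receives the ChatPromptValue, returns plain text
    return f"echo: {chat_value.to_string()}"

chain = (
    ChatPromptTemplate.from_template("问题: {question}")
    | RunnableLambda(fake_generate)   # wrap the plain function so it fits into the chain
    | StrOutputParser()
)
print(chain.invoke({"question": "测试"}))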
import torch
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

MODEL_PATH = "D:\\ModelScopeModels\\QWEN\\QWEN-2.5"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Generation config (tune as needed)
gen_config = GenerationConfig(
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)

# Custom Runnable: turn the ChatPromptValue into a string → tokenize → generate → decode
def qwen_generate(chat_value) -> str:
    prompt_str = chat_value.to_string()  # get the plain prompt text
    inputs = tokenizer(prompt_str, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, generation_config=gen_config)
    # keep only the newly generated tokens
    new_ids = output_ids[0][len(inputs.input_ids[0]):]
    return tokenizer.decode(new_ids, skip_special_tokens=True).strip()

prompt = "如实回答问题,不要编造答案, 问题: {question}"
prompt_template = ChatPromptTemplate.from_template(prompt)

chain = prompt_template | RunnableLambda(qwen_generate) | StrOutputParser()

if __name__ == "__main__":
    while True:
        q = input("User: ")
        if q.lower() in {"quit", "exit"}:
            break
        response = chain.invoke({"question": q})
        print("Assistant:", response)
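The script above sends the prompt to the model as raw text. Qwen2.5 is an instruction-tuned chat model, so an optional variant (an assumption on my part, not part of the original post) is to wrap the text with the tokenizer's chat template before tokenizing. A sketch of a drop-in replacement for qwen_generate, reusing the tokenizer, model, device, and gen_config already defined above:

def qwen_generate(chat_value) -> str:
    # Optional variant: format the prompt with Qwen's chat template so the
    # instruction-tuned model sees the message structure it was trained on.
    messages = [{"role": "user", "content": chat_value.to_string()}]
    prompt_str = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt_str, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, generation_config=gen_config)
    # keep only the newly generated tokens, as in the original function
    new_ids = output_ids[0][len(inputs.input_ids[0]):]
    return tokenizer.decode(new_ids, skip_special_tokens=True).strip()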