# 加载外部文件增强大模型 (load external files to augment the LLM)
# 安装必须的包 (install the required packages before running):
#   pip install docx2txt
#   pip install pypdf
#   pip install nltk
## 加载doc文档
from langchain.document_loaders import Docx2txtLoader
#定义chatdoc
class ChatDoc():
def fgetFilel):
#读取文件
loader=Docx2txtLoader('example/fake.docx")
text = loader.load()
return text
ChatDoc.getFile()
## 加载pdf
from langchain.document_loaders import PyPDFLoader
#定义chatdoc
class ChatDoc():
def getFile():
try:
#读取文件
Loader= PyPDFLoader('example/fake.pdf)
text = loader.load()
return text
except Exception as e:
print(f"Error loading files: (e)")
ChatDoc.getFile()
## 动态加载不同类型文件
from langchain.document_loaders import Docx2txtLoader, PyPDFLoader, UnstructuredExcelLoader
#定义chatdoc
class ChatDoc():
def __init__(self):
self.doc = None
self.splitText = []
def getFile(self):
doc = self.doc
loaders ={
"docx":Docx2txtLoader,
"pdf":PyPDFLoader,
"xlsx":UnstructuredExcelLoader,
}
file_extension doc.split(".")[-1]
loader_class loaders.get(file_extension)
if loader_class:
try:
loaderloader_class(doc)
text loader.load()
return text
except Exception as e:
print(f"Error loading{file extension}files:{e)")
else:
print(f"Unsupporyed file extension:{file_extension)")
chat_doc ChatDoc()
chat_doc.doc "example/fake.pdf"
chat_doc.getFile()
from langchain.text splitter import CharacterTextsplitter
#处理文档的函数
def splitsentences(self):
fuLL_text=seLf.getFile()#获取文档内容
if full_text != None:
#对文档进行分割
text_splitter CharacterTextSplitter(chunk_size=100,chunk_overlap=20)
texts = text_splitter.split_documents(full_text)
self.splitText = texts
chat_doc = ChatDoc()
chat_doc.doc = "example/fake.docx"
chat_doc.splitSentences()
print(chat_doc.splitText)
from Langchain.embeddings import OpenAIEmbeddings
from Langchain.vectorstores import Chroma
#向量化与向量存储
def embeddingAndVectorDB(self):
emmbeddings OpenAIEmbeddings()
db Chroma.from documents(documents=self.splitText,emmbedding=emmbeddings)
return db
chat_doc.embeddingAndvectorDB()
#提问并找到相关的文本块
def askAndFindFiles(self,question):
db = self.embeddingAndVectorDB()
retriever = db.as_retriever()
results = retriever.invoke(question)
return results
chat_doc.askAndFindFiles("这家公司l叫什么名字?")
### 多角度提问
def askAndFindFiles(self,question):
db = self.embeddingAndVectorDB()
#把问题交给LLM进行多角度扩展
Llm = ChatopenAI(temperature=0)
retriever_from_llm = MultiQueryRetriever.from_llm(
retriever=db.as_retriever(),
1lm=11m,
)
return retriever_from_llm.get_relevant_documents(question)
#设置下Logging查看生成查询
import logging
logging.basicConfig(level=logging.INFO)
logging.getLogger("Langchain.retrievers.multi_query").setLevel(logging.DEBUG)
unique_doc=chat_doc.askAndFindFiles("公司名称是什么?")
print(unique_doc)
#引入上下文压缩相关包
from langchain.Ilms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
#提问并找到相关的文本块
def askAndFindFiles(self,question):
db = self.embeddingAndVectorDB()
retriever = db.as_retriever()
llm = OpenAI(
temperature=0,
)
compressor = LLMChainExtractor.from_llm(llm=llm)
compressor_retriever = ContextualCompressionRetriever(
base_retriever=retriever,
base_compressor=compressor,
)
return compressor_retriever.get_relevant_documentsquery=question)
#设置下logging查看生成查询
import logging
logging.basicConfig(level=logging.INFO)
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.DEBUG)
unique_doc=chat_doc.askAndFindFiles("这间公司的负债有多少?)
print(unique_doc)
print(len(unique_doc))
### mmr和相似性打分
#提问并找到相关的文本块
def askAndFindFiles(self,question):
db=self.embeddingAndVectorDB()
#retriever=db.as_retriever(search_type="mmr")
retriever =db.as_retriever(search_type="similarity_score_threshold",search_kwargs=
("score_threshold":.5,'k':1})
return retriever.get_relevant_documents(query=question)
#导入聊天所需要的模块
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
def __init__(self):
self.doc = None
seLf.splitText= []#分割后的文本
self.template = [
("System“,"你是一个处理文档的秘书,你从不说自己是一个大模型或AI助手,你会根据下面提供的上下文内容来继续回答问题,\n上下文内容:\n(context)\n"),
("human","你好!“
("ai","你好!"),
("human","{question}")
]
self.prompt = ChatPromptTemplate.from_messages(self.template)
#用自然语言和文档聊天
def chatWithDoc(self,question):
_content=""
context self.askAndFindFiles(question)
for i in context:
_content +=i.page_content
messages self.prompt.format_messages(context=_content,question=question)
chat ChatopenAI(
model="gpt-4",
temperature=0
)
return chat.invoke(messages)
chat_doc ChatDoc()
chat_doc.doc "example/fake.docx"
chat_doc.splitSentences()
chat_doc.chatwithDoc("公司注册地址是哪里?)