Commit 900878e7 by 陈正乐

更改文本分割方式

parent e9834104
...@@ -18,3 +18,4 @@ mdtex2html==1.2.0 ...@@ -18,3 +18,4 @@ mdtex2html==1.2.0
faiss-cpu==1.7.2 # https://github.com/facebookresearch/faiss/blob/main/INSTALL.md faiss-cpu==1.7.2 # https://github.com/facebookresearch/faiss/blob/main/INSTALL.md
gradio==3.48.0 gradio==3.48.0
qianfan==0.3.13.1 qianfan==0.3.13.1
modelscope==1.14.0
import os import os
import sys import sys
import re
from os import path from os import path
import copy import copy
...@@ -13,6 +14,7 @@ from langchain.embeddings.huggingface import ( ...@@ -13,6 +14,7 @@ from langchain.embeddings.huggingface import (
) )
import math import math
import faiss import faiss
from langchain.vectorstores.utils import DistanceStrategy
from langchain.vectorstores.base import VectorStoreRetriever from langchain.vectorstores.base import VectorStoreRetriever
from langchain.callbacks.manager import ( from langchain.callbacks.manager import (
AsyncCallbackManagerForRetrieverRun, AsyncCallbackManagerForRetrieverRun,
...@@ -21,9 +23,9 @@ from langchain.callbacks.manager import ( ...@@ -21,9 +23,9 @@ from langchain.callbacks.manager import (
from src.loader import load from src.loader import load
from langchain.embeddings.base import Embeddings from langchain.embeddings.base import Embeddings
from src.pgdb.knowledge.callback import DocumentCallback, DefaultDocumentCallback from src.pgdb.knowledge.callback import DocumentCallback, DefaultDocumentCallback
from modelscope.pipelines import pipeline import operator
from modelscope.utils.constant import Tasks from langchain.vectorstores.utils import DistanceStrategy
from src.config.consts import NLP_BERT_PATH import numpy as np
sys.path.append("../") sys.path.append("../")
...@@ -53,6 +55,9 @@ def get_embding(_path: str) -> Embeddings: ...@@ -53,6 +55,9 @@ def get_embding(_path: str) -> Embeddings:
return EmbeddingFactory(_path).get_embedding() return EmbeddingFactory(_path).get_embedding()
class RE_FAISS(FAISS): class RE_FAISS(FAISS):
# 去重,并保留metadate # 去重,并保留metadate
@staticmethod @staticmethod
...@@ -175,8 +180,7 @@ def getFAISS(embedding_model_name: str, store_path: str, info: dict = None, inde ...@@ -175,8 +180,7 @@ def getFAISS(embedding_model_name: str, store_path: str, info: dict = None, inde
index_to_docstore_id={}) index_to_docstore_id={})
else: else:
print("load_local faiss") print("load_local faiss")
_faiss = RE_FAISS.load_local(folder_path=store_path, index_name=index_name, embeddings=embeddings, _faiss = RE_FAISS.load_local(folder_path=store_path, index_name=index_name, embeddings=embeddings, allow_dangerous_deserialization=True)
allow_dangerous_deserialization=True)
if docstore1 and is_pgsql: # 如果外部参数调整,更新docstore if docstore1 and is_pgsql: # 如果外部参数调整,更新docstore
_faiss.docstore = docstore1 _faiss.docstore = docstore1
return _faiss return _faiss
...@@ -245,22 +249,16 @@ class VectorStore_FAISS(FAISS): ...@@ -245,22 +249,16 @@ class VectorStore_FAISS(FAISS):
# Document { # Document {
# page_content 段落 # page_content 段落
# metadata { # metadata {
# page 页码 # page 页码
# } # }
# } # }
def _add_documents(self, new_docs: List[Document], need_split: bool = True, pattern: str = r'[?。;\n]'): def _add_documents(self, new_docs: List[Document], need_split: bool = True, pattern: str = r'[?。;\n]'):
list_of_documents: List[Document] = [] list_of_documents: List[Document] = []
if self.doc_callback: if self.doc_callback:
new_docs = self.doc_callback.before_store(self._faiss.docstore, new_docs) new_docs = self.doc_callback.before_store(self._faiss.docstore, new_docs)
if need_split: if need_split:
p = pipeline(
task=Tasks.document_segmentation,
model=NLP_BERT_PATH,
model_revision='v1.0.1'
)
for doc in new_docs: for doc in new_docs:
# words_list = re.split(pattern, doc.page_content) words_list = re.split(pattern, doc.page_content)
words_list = p(documents=doc.page_content)['text'].split('\n\t')
# 去掉重复项 # 去掉重复项
words_list = set(words_list) words_list = set(words_list)
words_list = [str(words) for words in words_list] words_list = [str(words) for words in words_list]
...@@ -271,9 +269,6 @@ class VectorStore_FAISS(FAISS): ...@@ -271,9 +269,6 @@ class VectorStore_FAISS(FAISS):
list_of_documents.append(Document(page_content=words, metadata=metadata)) list_of_documents.append(Document(page_content=words, metadata=metadata))
else: else:
list_of_documents = new_docs list_of_documents = new_docs
print("====================================================================================")
print(list_of_documents)
print("====================================================================================")
self._faiss.add_documents(list_of_documents) self._faiss.add_documents(list_of_documents)
def _add_documents_from_dir(self, filepaths=None, load_kwargs=None): def _add_documents_from_dir(self, filepaths=None, load_kwargs=None):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment