Commit 900878e7 by 陈正乐

Change the text-splitting method

parent e9834104
@@ -18,3 +18,4 @@ mdtex2html==1.2.0
faiss-cpu==1.7.2 # https://github.com/facebookresearch/faiss/blob/main/INSTALL.md
gradio==3.48.0
qianfan==0.3.13.1
modelscope==1.14.0
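The new modelscope pin backs the document-segmentation pipeline used in the module below. As a minimal standalone sketch of that call (the sample text is illustrative; NLP_BERT_PATH is this repo's configured model path from src.config.consts):

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

from src.config.consts import NLP_BERT_PATH  # local model path used below

# Same construction as in the diff below: a BERT-based document
# segmentation pipeline pinned to revision v1.0.1.
p = pipeline(
    task=Tasks.document_segmentation,
    model=NLP_BERT_PATH,
    model_revision='v1.0.1'
)

# The pipeline returns {'text': ...} with segments joined by '\n\t'.
result = p(documents="这是第一段的内容。这是第二段的内容。")
paragraphs = result['text'].split('\n\t')
```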
import os
import sys
import re
from os import path
import copy
@@ -13,6 +14,7 @@ from langchain.embeddings.huggingface import (
)
import math
import faiss
from langchain.vectorstores.utils import DistanceStrategy
from langchain.vectorstores.base import VectorStoreRetriever
from langchain.callbacks.manager import (
AsyncCallbackManagerForRetrieverRun,
@@ -21,9 +23,9 @@ from langchain.callbacks.manager import (
from src.loader import load
from langchain.embeddings.base import Embeddings
from src.pgdb.knowledge.callback import DocumentCallback, DefaultDocumentCallback
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from src.config.consts import NLP_BERT_PATH
import operator
from langchain.vectorstores.utils import DistanceStrategy
import numpy as np
sys.path.append("../")
@@ -53,6 +55,9 @@ def get_embding(_path: str) -> Embeddings:
return EmbeddingFactory(_path).get_embedding()
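The body of EmbeddingFactory is collapsed out of this diff; as a hedged sketch, assuming it simply wraps langchain's HuggingFaceEmbeddings around a local model path:

```python
from langchain.embeddings.base import Embeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Assumption: the factory maps a local model directory to a langchain
# Embeddings instance; the real class may pick among several backends.
class EmbeddingFactorySketch:
    def __init__(self, path: str):
        self.path = path

    def get_embedding(self) -> Embeddings:
        return HuggingFaceEmbeddings(model_name=self.path)
```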
class RE_FAISS(FAISS):
# deduplicate search results while keeping their metadata
@staticmethod
@@ -175,8 +180,7 @@ def getFAISS(embedding_model_name: str, store_path: str, info: dict = None, inde
index_to_docstore_id={})
else:
print("load_local faiss")
_faiss = RE_FAISS.load_local(folder_path=store_path, index_name=index_name, embeddings=embeddings,
allow_dangerous_deserialization=True)
_faiss = RE_FAISS.load_local(folder_path=store_path, index_name=index_name, embeddings=embeddings, allow_dangerous_deserialization=True)
if docstore1 and is_pgsql:  # if the docstore was adjusted externally, update it
_faiss.docstore = docstore1
return _faiss
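load_local unpickles <index_name>.pkl from disk, which is why newer langchain releases require the explicit allow_dangerous_deserialization opt-in seen above. A usage sketch of the loader branch, with illustrative paths:

```python
# Illustrative only: the folder, index name and model path are assumptions.
# The opt-in flag is acceptable here because the .pkl docstore was written
# by this project itself, not downloaded from an untrusted source.
embeddings = get_embding("./models/text2vec-base-chinese")
store = RE_FAISS.load_local(
    folder_path="./faiss_store",
    index_name="index",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)
docs = store.similarity_search("电力交易规则", k=4)
```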
@@ -245,22 +249,16 @@ class VectorStore_FAISS(FAISS):
# Document {
#     page_content  the paragraph text
#     metadata {
#         page  the page number
#     }
# }
def _add_documents(self, new_docs: List[Document], need_split: bool = True, pattern: str = r'[?。;\n]'):
list_of_documents: List[Document] = []
if self.doc_callback:
new_docs = self.doc_callback.before_store(self._faiss.docstore, new_docs)
if need_split:
p = pipeline(
task=Tasks.document_segmentation,
model=NLP_BERT_PATH,
model_revision='v1.0.1'
)
for doc in new_docs:
# words_list = re.split(pattern, doc.page_content)
words_list = p(documents=doc.page_content)['text'].split('\n\t')
words_list = re.split(pattern, doc.page_content)
# remove duplicate segments
words_list = set(words_list)
words_list = [str(words) for words in words_list]
@@ -271,9 +269,6 @@ class VectorStore_FAISS(FAISS):
list_of_documents.append(Document(page_content=words, metadata=metadata))
else:
list_of_documents = new_docs
print("====================================================================================")
print(list_of_documents)
print("====================================================================================")
self._faiss.add_documents(list_of_documents)
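In isolation, the split-then-dedup sequence above behaves as follows (a small illustrative run; note that set() also drops the original ordering and keeps empty segments):

```python
import re

pattern = r'[?。;\n]'            # same delimiters as _add_documents above
text = "第一句。第二句;第一句。"

segments = re.split(pattern, text)
# -> ['第一句', '第二句', '第一句', '']

deduped = [str(s) for s in set(segments)]
# duplicates collapse, but set() keeps the trailing '' and loses ordering
```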
def _add_documents_from_dir(self, filepaths=None, load_kwargs=None):
......