Commit 194e12f2 by 陈正乐

Package import locations updated for the langchain version upgrade

parent 45c4a373
 import os, copy
-from langchain.document_loaders import UnstructuredFileLoader, TextLoader, CSVLoader, UnstructuredPDFLoader, \
+from langchain_community.document_loaders import UnstructuredFileLoader, TextLoader, CSVLoader, UnstructuredPDFLoader, \
     UnstructuredWordDocumentLoader, PDFMinerPDFasHTMLLoader
 from .config import SENTENCE_SIZE, ZH_TITLE_ENHANCE
 from .chinese_text_splitter import ChineseTextSplitter
 from .zh_title_enhance import zh_title_enhance
 from langchain.schema import Document
-from typing import List, Dict, Optional
+from typing import List
 from src.loader.callback import BaseCallback
 import re
 from bs4 import BeautifulSoup
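Note for anyone applying the same migration: from langchain 0.1 onward the document loaders ship in the separate langchain_community package, and the old langchain.document_loaders path is deprecated. A minimal sketch of a version-tolerant import, assuming you need to support both sides of the split (the try/except fallback is an illustration, not part of this commit):

```python
# Illustrative compatibility shim (not in this commit): prefer the post-0.1
# langchain_community location, fall back to the legacy path on old releases.
try:
    from langchain_community.document_loaders import TextLoader, CSVLoader
except ImportError:  # langchain < 0.1: loaders still live in langchain itself
    from langchain.document_loaders import TextLoader, CSVLoader
```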
@@ -6,7 +6,7 @@ from os import path
 import copy
 from typing import List, OrderedDict, Any, Optional, Tuple, Dict
 from src.pgdb.knowledge.pgsqldocstore import InMemorySecondaryDocstore
-from langchain.vectorstores.faiss import FAISS, dependable_faiss_import
+from langchain.vectorstores.faiss import FAISS
 from langchain.schema import Document
 from src.pgdb.knowledge.pgsqldocstore import PgSqlDocstore
 from langchain.embeddings.huggingface import (
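dependable_faiss_import disappears from this import list because its only caller, the similarity_search_with_score_by_vector override below, is being disabled. For context, a hedged sketch of the pattern such a helper implements (my paraphrase, not langchain's exact source):

```python
def dependable_faiss_import():
    # Import faiss lazily so the module loads without it, and fail with an
    # actionable message instead of a bare ImportError when it is missing.
    try:
        import faiss
    except ImportError as exc:
        raise ImportError(
            "Could not import faiss. Install faiss-cpu (or faiss-gpu for CUDA)."
        ) from exc
    return faiss
```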
@@ -74,56 +74,56 @@ class RE_FAISS(FAISS):
                                    deduplicated_dict.items()]
         return deduplicated_documents
 
-    def similarity_search_with_score_by_vector(
-        self,
-        embedding: List[float],
-        k: int = 4,
-        filter: Optional[Dict[str, Any]] = None,
-        fetch_k: int = 20,
-        **kwargs: Any,
-    ) -> List[Tuple[Document, float]]:
-        faiss = dependable_faiss_import()
-        vector = np.array([embedding], dtype=np.float32)
-        if self._normalize_L2:
-            faiss.normalize_L2(vector)
-        scores, indices = self.index.search(vector, k if filter is None else fetch_k)
-        docs = []
-        for j, i in enumerate(indices[0]):
-            if i == -1:
-                # This happens when not enough docs are returned.
-                continue
-            _id = self.index_to_docstore_id[i]
-            doc = self.docstore.search(_id)
-            if not isinstance(doc, Document):
-                raise ValueError(f"Could not find document for id {_id}, got {doc}")
-            if filter is not None:
-                filter = {
-                    key: [value] if not isinstance(value, list) else value
-                    for key, value in filter.items()
-                }
-                if all(doc.metadata.get(key) in value for key, value in filter.items()):
-                    docs.append((doc, scores[0][j]))
-            else:
-                docs.append((doc, scores[0][j]))
-        docs = self._tuple_deduplication(docs)
-        score_threshold = kwargs.get("score_threshold")
-        if score_threshold is not None:
-            cmp = (
-                operator.ge
-                if self.distance_strategy
-                in (DistanceStrategy.MAX_INNER_PRODUCT, DistanceStrategy.JACCARD)
-                else operator.le
-            )
-            docs = [
-                (doc, similarity)
-                for doc, similarity in docs
-                if cmp(similarity, score_threshold)
-            ]
-
-        if "doc_callback" in kwargs:
-            if hasattr(kwargs["doc_callback"], 'after_search'):
-                docs = kwargs["doc_callback"].after_search(self.docstore, docs, number=k)
-        return docs[:k]
+    # def similarity_search_with_score_by_vector(
+    #     self,
+    #     embedding: List[float],
+    #     k: int = 4,
+    #     filter: Optional[Dict[str, Any]] = None,
+    #     fetch_k: int = 20,
+    #     **kwargs: Any,
+    # ) -> List[Tuple[Document, float]]:
+    #     faiss = dependable_faiss_import()
+    #     vector = np.array([embedding], dtype=np.float32)
+    #     if self._normalize_L2:
+    #         faiss.normalize_L2(vector)
+    #     scores, indices = self.index.search(vector, k if filter is None else fetch_k)
+    #     docs = []
+    #     for j, i in enumerate(indices[0]):
+    #         if i == -1:
+    #             # This happens when not enough docs are returned.
+    #             continue
+    #         _id = self.index_to_docstore_id[i]
+    #         doc = self.docstore.search(_id)
+    #         if not isinstance(doc, Document):
+    #             raise ValueError(f"Could not find document for id {_id}, got {doc}")
+    #         if filter is not None:
+    #             filter = {
+    #                 key: [value] if not isinstance(value, list) else value
+    #                 for key, value in filter.items()
+    #             }
+    #             if all(doc.metadata.get(key) in value for key, value in filter.items()):
+    #                 docs.append((doc, scores[0][j]))
+    #         else:
+    #             docs.append((doc, scores[0][j]))
+    #     docs = self._tuple_deduplication(docs)
+    #     score_threshold = kwargs.get("score_threshold")
+    #     if score_threshold is not None:
+    #         cmp = (
+    #             operator.ge
+    #             if self.distance_strategy
+    #             in (DistanceStrategy.MAX_INNER_PRODUCT, DistanceStrategy.JACCARD)
+    #             else operator.le
+    #         )
+    #         docs = [
+    #             (doc, similarity)
+    #             for doc, similarity in docs
+    #             if cmp(similarity, score_threshold)
+    #         ]
+    #
+    #     if "doc_callback" in kwargs:
+    #         if hasattr(kwargs["doc_callback"], 'after_search'):
+    #             docs = kwargs["doc_callback"].after_search(self.docstore, docs, number=k)
+    #     return docs[:k]
 
     def max_marginal_relevance_search_by_vector(
         self,
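The override disabled above replicated the upstream FAISS search loop and layered three extras on top: deduplication via _tuple_deduplication, an optional doc_callback.after_search hook, and direction-aware score thresholding. The thresholding is the subtle part: for MAX_INNER_PRODUCT or JACCARD scores larger means more similar, while for L2 distances smaller is better, so the comparison operator has to flip. A self-contained sketch of just that logic (function name and example values are mine, not from the repo):

```python
import operator
from typing import List, Tuple

def filter_by_score(hits: List[Tuple[str, float]],
                    score_threshold: float,
                    higher_is_better: bool) -> List[Tuple[str, float]]:
    # Similarity scores (inner product, Jaccard): keep hits >= threshold.
    # Distance scores (L2): keep hits <= threshold.
    cmp = operator.ge if higher_is_better else operator.le
    return [(doc, score) for doc, score in hits if cmp(score, score_threshold)]

# Example with L2 distances: only doc-a falls within 0.3 of the query.
hits = [("doc-a", 0.12), ("doc-b", 0.45)]
print(filter_by_score(hits, 0.3, higher_is_better=False))  # [('doc-a', 0.12)]
```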
@@ -180,7 +180,7 @@ def getFAISS(embedding_model_name: str, store_path: str, info: dict = None, inde
                            index_to_docstore_id={})
     else:
         print("load_local faiss")
-        _faiss = RE_FAISS.load_local(folder_path=store_path, index_name=index_name, embeddings=embeddings)
+        _faiss = RE_FAISS.load_local(folder_path=store_path, index_name=index_name, embeddings=embeddings, allow_dangerous_deserialization=True)
     if docstore1 and is_pgsql:  # update the docstore if external parameters were adjusted
         _faiss.docstore = docstore1
     return _faiss
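The added allow_dangerous_deserialization=True is what newer langchain releases require here: load_local restores the docstore with pickle, which can execute arbitrary code, so the caller has to opt in explicitly and should only load index folders it wrote itself. A round-trip sketch under that assumption (store path, index name, and embedding model are placeholders):

```python
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Placeholder model and paths, for illustration only.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-zh")
store = FAISS.from_texts(["你好,世界", "hello world"], embeddings)
store.save_local("faiss_store", index_name="demo")

reloaded = FAISS.load_local(
    "faiss_store",
    embeddings,
    index_name="demo",
    # Opt-in: the pickled docstore is only safe if you created it yourself.
    allow_dangerous_deserialization=True,
)
print(reloaded.similarity_search("hello", k=1)[0].page_content)
```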