Commit 194e12f2 by 陈正乐

Package locations updated due to a langchain version update

parent 45c4a373
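Note: the diff below boils down to the langchain package split, in which integration modules such as the document loaders moved from `langchain.*` to `langchain_community.*`. A minimal, version-tolerant import sketch (the try/except fallback is an illustration, not part of this commit):

```python
# Version-tolerant import sketch; the fallback pattern is an illustration,
# not code from this commit.
try:
    # langchain >= 0.1: integrations live in langchain_community
    from langchain_community.document_loaders import TextLoader, CSVLoader
except ImportError:
    # older langchain releases, before the package split
    from langchain.document_loaders import TextLoader, CSVLoader
```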
import os, copy
-from langchain.document_loaders import UnstructuredFileLoader, TextLoader, CSVLoader, UnstructuredPDFLoader, \
+from langchain_community.document_loaders import UnstructuredFileLoader, TextLoader, CSVLoader, UnstructuredPDFLoader, \
    UnstructuredWordDocumentLoader, PDFMinerPDFasHTMLLoader
from .config import SENTENCE_SIZE, ZH_TITLE_ENHANCE
from .chinese_text_splitter import ChineseTextSplitter
from .zh_title_enhance import zh_title_enhance
from langchain.schema import Document
from typing import List, Dict, Optional
from typing import List
from src.loader.callback import BaseCallback
import re
from bs4 import BeautifulSoup
@@ -6,7 +6,7 @@ from os import path
import copy
from typing import List, OrderedDict, Any, Optional, Tuple, Dict
from src.pgdb.knowledge.pgsqldocstore import InMemorySecondaryDocstore
-from langchain.vectorstores.faiss import FAISS, dependable_faiss_import
+from langchain.vectorstores.faiss import FAISS
from langchain.schema import Document
from src.pgdb.knowledge.pgsqldocstore import PgSqlDocstore
from langchain.embeddings.huggingface import (
@@ -74,56 +74,56 @@ class RE_FAISS(FAISS):
            deduplicated_dict.items()]
        return deduplicated_documents
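Only the last two lines of `_tuple_deduplication` are visible in this hunk. Judging from them and the `OrderedDict` import above, a plausible reconstruction (an assumption, not the committed code) deduplicates first-wins on page content:

```python
from collections import OrderedDict
from typing import List, Tuple

from langchain.schema import Document


def tuple_deduplication(docs: List[Tuple[Document, float]]) -> List[Tuple[Document, float]]:
    # Hypothetical reconstruction: keep the first (doc, score) pair seen
    # for each distinct page_content, preserving encounter order.
    deduplicated_dict: "OrderedDict[str, Tuple[Document, float]]" = OrderedDict()
    for doc, score in docs:
        deduplicated_dict.setdefault(doc.page_content, (doc, score))
    deduplicated_documents = [(doc, score) for _, (doc, score) in
                              deduplicated_dict.items()]
    return deduplicated_documents
```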
-    def similarity_search_with_score_by_vector(
-            self,
-            embedding: List[float],
-            k: int = 4,
-            filter: Optional[Dict[str, Any]] = None,
-            fetch_k: int = 20,
-            **kwargs: Any,
-    ) -> List[Tuple[Document, float]]:
-        faiss = dependable_faiss_import()
-        vector = np.array([embedding], dtype=np.float32)
-        if self._normalize_L2:
-            faiss.normalize_L2(vector)
-        scores, indices = self.index.search(vector, k if filter is None else fetch_k)
-        docs = []
-        for j, i in enumerate(indices[0]):
-            if i == -1:
-                # This happens when not enough docs are returned.
-                continue
-            _id = self.index_to_docstore_id[i]
-            doc = self.docstore.search(_id)
-            if not isinstance(doc, Document):
-                raise ValueError(f"Could not find document for id {_id}, got {doc}")
-            if filter is not None:
-                filter = {
-                    key: [value] if not isinstance(value, list) else value
-                    for key, value in filter.items()
-                }
-                if all(doc.metadata.get(key) in value for key, value in filter.items()):
-                    docs.append((doc, scores[0][j]))
-            else:
-                docs.append((doc, scores[0][j]))
-        docs = self._tuple_deduplication(docs)
-        score_threshold = kwargs.get("score_threshold")
-        if score_threshold is not None:
-            cmp = (
-                operator.ge
-                if self.distance_strategy
-                in (DistanceStrategy.MAX_INNER_PRODUCT, DistanceStrategy.JACCARD)
-                else operator.le
-            )
-            docs = [
-                (doc, similarity)
-                for doc, similarity in docs
-                if cmp(similarity, score_threshold)
-            ]
-        if "doc_callback" in kwargs:
-            if hasattr(kwargs["doc_callback"], 'after_search'):
-                docs = kwargs["doc_callback"].after_search(self.docstore, docs, number=k)
-        return docs[:k]
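The removal above traces back to the import change earlier in this file: the new import line no longer brings in `dependable_faiss_import`, which the first statement of this method's body calls. A minimal stand-in with an assumed name, sketching what that helper does (not code from this commit):

```python
def import_faiss_or_raise():
    # Hypothetical stand-in (name is mine) for what dependable_faiss_import
    # does: import the faiss bindings, failing with an installation hint.
    try:
        import faiss
    except ImportError as exc:
        raise ImportError(
            "Could not import faiss. Install it with `pip install faiss-cpu` "
            "(or faiss-gpu for CUDA builds)."
        ) from exc
    return faiss
```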
+    # def similarity_search_with_score_by_vector(
+    #         self,
+    #         embedding: List[float],
+    #         k: int = 4,
+    #         filter: Optional[Dict[str, Any]] = None,
+    #         fetch_k: int = 20,
+    #         **kwargs: Any,
+    # ) -> List[Tuple[Document, float]]:
+    #     faiss = dependable_faiss_import()
+    #     vector = np.array([embedding], dtype=np.float32)
+    #     if self._normalize_L2:
+    #         faiss.normalize_L2(vector)
+    #     scores, indices = self.index.search(vector, k if filter is None else fetch_k)
+    #     docs = []
+    #     for j, i in enumerate(indices[0]):
+    #         if i == -1:
+    #             # This happens when not enough docs are returned.
+    #             continue
+    #         _id = self.index_to_docstore_id[i]
+    #         doc = self.docstore.search(_id)
+    #         if not isinstance(doc, Document):
+    #             raise ValueError(f"Could not find document for id {_id}, got {doc}")
+    #         if filter is not None:
+    #             filter = {
+    #                 key: [value] if not isinstance(value, list) else value
+    #                 for key, value in filter.items()
+    #             }
+    #             if all(doc.metadata.get(key) in value for key, value in filter.items()):
+    #                 docs.append((doc, scores[0][j]))
+    #         else:
+    #             docs.append((doc, scores[0][j]))
+    #     docs = self._tuple_deduplication(docs)
+    #     score_threshold = kwargs.get("score_threshold")
+    #     if score_threshold is not None:
+    #         cmp = (
+    #             operator.ge
+    #             if self.distance_strategy
+    #             in (DistanceStrategy.MAX_INNER_PRODUCT, DistanceStrategy.JACCARD)
+    #             else operator.le
+    #         )
+    #         docs = [
+    #             (doc, similarity)
+    #             for doc, similarity in docs
+    #             if cmp(similarity, score_threshold)
+    #         ]
+    #
+    #     if "doc_callback" in kwargs:
+    #         if hasattr(kwargs["doc_callback"], 'after_search'):
+    #             docs = kwargs["doc_callback"].after_search(self.docstore, docs, number=k)
+    #     return docs[:k]
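For readers skimming the override kept above as a comment: its `score_threshold` filtering flips the comparison direction depending on the distance strategy. A small, self-contained illustration (variable names are mine, not from the commit):

```python
import operator

# For similarity-style scores (MAX_INNER_PRODUCT, JACCARD) higher is better,
# so results must be >= the threshold; for distance-style scores (the L2
# default) lower is better, so results must be <= the threshold.
keep_similarity = operator.ge
keep_distance = operator.le

assert keep_similarity(0.9, 0.8)  # similarity 0.9 passes a 0.8 threshold
assert keep_distance(0.3, 0.5)    # distance 0.3 passes a 0.5 threshold
```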
    def max_marginal_relevance_search_by_vector(
            self,
@@ -180,7 +180,7 @@ def getFAISS(embedding_model_name: str, store_path: str, info: dict = None, inde
                         index_to_docstore_id={})
    else:
        print("load_local faiss")
-        _faiss = RE_FAISS.load_local(folder_path=store_path, index_name=index_name, embeddings=embeddings)
+        _faiss = RE_FAISS.load_local(folder_path=store_path, index_name=index_name, embeddings=embeddings, allow_dangerous_deserialization=True)
    if docstore1 and is_pgsql:  # if external parameters were adjusted, update the docstore
        _faiss.docstore = docstore1
    return _faiss
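The only functional change in this last hunk is the new `allow_dangerous_deserialization=True` argument: recent langchain releases make `FAISS.load_local` refuse to unpickle the saved docstore without an explicit opt-in. A hedged usage sketch, with placeholder store path and model name rather than values from this repository:

```python
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.faiss import FAISS

# Placeholder model name and store path.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
store = FAISS.load_local(
    folder_path="./faiss_store",
    index_name="index",
    embeddings=embeddings,
    # Explicit opt-in to pickle deserialization of the saved docstore;
    # only safe for index files you created yourself.
    allow_dangerous_deserialization=True,
)
```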