Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
L
LAE
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
文靖昊
LAE
Commits
194e12f2
Commit
194e12f2
authored
a year ago
by
陈正乐
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
由于langchain版本更新,包位置更新
parent
45c4a373
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
54 additions
and
54 deletions
+54
-54
load.py
src/loader/load.py
+2
-2
similarity.py
src/pgdb/knowledge/similarity.py
+52
-52
No files found.
src/loader/load.py
View file @
194e12f2
import
os
,
copy
from
langchain.document_loaders
import
UnstructuredFileLoader
,
TextLoader
,
CSVLoader
,
UnstructuredPDFLoader
,
\
from
langchain
_community
.document_loaders
import
UnstructuredFileLoader
,
TextLoader
,
CSVLoader
,
UnstructuredPDFLoader
,
\
UnstructuredWordDocumentLoader
,
PDFMinerPDFasHTMLLoader
from
.config
import
SENTENCE_SIZE
,
ZH_TITLE_ENHANCE
from
.chinese_text_splitter
import
ChineseTextSplitter
from
.zh_title_enhance
import
zh_title_enhance
from
langchain.schema
import
Document
from
typing
import
List
,
Dict
,
Optional
from
typing
import
List
from
src.loader.callback
import
BaseCallback
import
re
from
bs4
import
BeautifulSoup
...
...
This diff is collapsed.
Click to expand it.
src/pgdb/knowledge/similarity.py
View file @
194e12f2
...
...
@@ -6,7 +6,7 @@ from os import path
import
copy
from
typing
import
List
,
OrderedDict
,
Any
,
Optional
,
Tuple
,
Dict
from
src.pgdb.knowledge.pgsqldocstore
import
InMemorySecondaryDocstore
from
langchain.vectorstores.faiss
import
FAISS
,
dependable_faiss_import
from
langchain.vectorstores.faiss
import
FAISS
from
langchain.schema
import
Document
from
src.pgdb.knowledge.pgsqldocstore
import
PgSqlDocstore
from
langchain.embeddings.huggingface
import
(
...
...
@@ -74,56 +74,56 @@ class RE_FAISS(FAISS):
deduplicated_dict
.
items
()]
return
deduplicated_documents
def
similarity_search_with_score_by_vector
(
self
,
embedding
:
List
[
float
],
k
:
int
=
4
,
filter
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
fetch_k
:
int
=
20
,
**
kwargs
:
Any
,
)
->
List
[
Tuple
[
Document
,
float
]]:
faiss
=
dependable_faiss_import
()
vector
=
np
.
array
([
embedding
],
dtype
=
np
.
float32
)
if
self
.
_normalize_L2
:
faiss
.
normalize_L2
(
vector
)
scores
,
indices
=
self
.
index
.
search
(
vector
,
k
if
filter
is
None
else
fetch_k
)
docs
=
[]
for
j
,
i
in
enumerate
(
indices
[
0
]):
if
i
==
-
1
:
# This happens when not enough docs are returned.
continue
_id
=
self
.
index_to_docstore_id
[
i
]
doc
=
self
.
docstore
.
search
(
_id
)
if
not
isinstance
(
doc
,
Document
):
raise
ValueError
(
f
"Could not find document for id {_id}, got {doc}"
)
if
filter
is
not
None
:
filter
=
{
key
:
[
value
]
if
not
isinstance
(
value
,
list
)
else
value
for
key
,
value
in
filter
.
items
()
}
if
all
(
doc
.
metadata
.
get
(
key
)
in
value
for
key
,
value
in
filter
.
items
()):
docs
.
append
((
doc
,
scores
[
0
][
j
]))
else
:
docs
.
append
((
doc
,
scores
[
0
][
j
]))
docs
=
self
.
_tuple_deduplication
(
docs
)
score_threshold
=
kwargs
.
get
(
"score_threshold"
)
if
score_threshold
is
not
None
:
cmp
=
(
operator
.
ge
if
self
.
distance_strategy
in
(
DistanceStrategy
.
MAX_INNER_PRODUCT
,
DistanceStrategy
.
JACCARD
)
else
operator
.
le
)
docs
=
[
(
doc
,
similarity
)
for
doc
,
similarity
in
docs
if
cmp
(
similarity
,
score_threshold
)
]
if
"doc_callback"
in
kwargs
:
if
hasattr
(
kwargs
[
"doc_callback"
],
'after_search'
):
docs
=
kwargs
[
"doc_callback"
]
.
after_search
(
self
.
docstore
,
docs
,
number
=
k
)
return
docs
[:
k
]
#
def similarity_search_with_score_by_vector(
#
self,
#
embedding: List[float],
#
k: int = 4,
#
filter: Optional[Dict[str, Any]] = None,
#
fetch_k: int = 20,
#
**kwargs: Any,
#
) -> List[Tuple[Document, float]]:
#
faiss = dependable_faiss_import()
#
vector = np.array([embedding], dtype=np.float32)
#
if self._normalize_L2:
#
faiss.normalize_L2(vector)
#
scores, indices = self.index.search(vector, k if filter is None else fetch_k)
#
docs = []
#
for j, i in enumerate(indices[0]):
#
if i == -1:
#
# This happens when not enough docs are returned.
#
continue
#
_id = self.index_to_docstore_id[i]
#
doc = self.docstore.search(_id)
#
if not isinstance(doc, Document):
#
raise ValueError(f"Could not find document for id {_id}, got {doc}")
#
if filter is not None:
#
filter = {
#
key: [value] if not isinstance(value, list) else value
#
for key, value in filter.items()
#
}
#
if all(doc.metadata.get(key) in value for key, value in filter.items()):
#
docs.append((doc, scores[0][j]))
#
else:
#
docs.append((doc, scores[0][j]))
#
docs = self._tuple_deduplication(docs)
#
score_threshold = kwargs.get("score_threshold")
#
if score_threshold is not None:
#
cmp = (
#
operator.ge
#
if self.distance_strategy
#
in (DistanceStrategy.MAX_INNER_PRODUCT, DistanceStrategy.JACCARD)
#
else operator.le
#
)
#
docs = [
#
(doc, similarity)
#
for doc, similarity in docs
#
if cmp(similarity, score_threshold)
#
]
#
#
if "doc_callback" in kwargs:
#
if hasattr(kwargs["doc_callback"], 'after_search'):
#
docs = kwargs["doc_callback"].after_search(self.docstore, docs, number=k)
#
return docs[:k]
def
max_marginal_relevance_search_by_vector
(
self
,
...
...
@@ -180,7 +180,7 @@ def getFAISS(embedding_model_name: str, store_path: str, info: dict = None, inde
index_to_docstore_id
=
{})
else
:
print
(
"load_local faiss"
)
_faiss
=
RE_FAISS
.
load_local
(
folder_path
=
store_path
,
index_name
=
index_name
,
embeddings
=
embeddings
)
_faiss
=
RE_FAISS
.
load_local
(
folder_path
=
store_path
,
index_name
=
index_name
,
embeddings
=
embeddings
,
allow_dangerous_deserialization
=
True
)
if
docstore1
and
is_pgsql
:
# 如果外部参数调整,更新docstore
_faiss
.
docstore
=
docstore1
return
_faiss
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment