Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
L
LAE
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
文靖昊
LAE
Commits
194e12f2
Commit
194e12f2
authored
Apr 29, 2024
by
陈正乐
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
由于langchain版本更新,包位置更新
parent
45c4a373
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
54 additions
and
54 deletions
+54
-54
load.py
src/loader/load.py
+2
-2
similarity.py
src/pgdb/knowledge/similarity.py
+52
-52
No files found.
src/loader/load.py
View file @
194e12f2
import
os
,
copy
import
os
,
copy
from
langchain.document_loaders
import
UnstructuredFileLoader
,
TextLoader
,
CSVLoader
,
UnstructuredPDFLoader
,
\
from
langchain
_community
.document_loaders
import
UnstructuredFileLoader
,
TextLoader
,
CSVLoader
,
UnstructuredPDFLoader
,
\
UnstructuredWordDocumentLoader
,
PDFMinerPDFasHTMLLoader
UnstructuredWordDocumentLoader
,
PDFMinerPDFasHTMLLoader
from
.config
import
SENTENCE_SIZE
,
ZH_TITLE_ENHANCE
from
.config
import
SENTENCE_SIZE
,
ZH_TITLE_ENHANCE
from
.chinese_text_splitter
import
ChineseTextSplitter
from
.chinese_text_splitter
import
ChineseTextSplitter
from
.zh_title_enhance
import
zh_title_enhance
from
.zh_title_enhance
import
zh_title_enhance
from
langchain.schema
import
Document
from
langchain.schema
import
Document
from
typing
import
List
,
Dict
,
Optional
from
typing
import
List
from
src.loader.callback
import
BaseCallback
from
src.loader.callback
import
BaseCallback
import
re
import
re
from
bs4
import
BeautifulSoup
from
bs4
import
BeautifulSoup
...
...
src/pgdb/knowledge/similarity.py
View file @
194e12f2
...
@@ -6,7 +6,7 @@ from os import path
...
@@ -6,7 +6,7 @@ from os import path
import
copy
import
copy
from
typing
import
List
,
OrderedDict
,
Any
,
Optional
,
Tuple
,
Dict
from
typing
import
List
,
OrderedDict
,
Any
,
Optional
,
Tuple
,
Dict
from
src.pgdb.knowledge.pgsqldocstore
import
InMemorySecondaryDocstore
from
src.pgdb.knowledge.pgsqldocstore
import
InMemorySecondaryDocstore
from
langchain.vectorstores.faiss
import
FAISS
,
dependable_faiss_import
from
langchain.vectorstores.faiss
import
FAISS
from
langchain.schema
import
Document
from
langchain.schema
import
Document
from
src.pgdb.knowledge.pgsqldocstore
import
PgSqlDocstore
from
src.pgdb.knowledge.pgsqldocstore
import
PgSqlDocstore
from
langchain.embeddings.huggingface
import
(
from
langchain.embeddings.huggingface
import
(
...
@@ -74,56 +74,56 @@ class RE_FAISS(FAISS):
...
@@ -74,56 +74,56 @@ class RE_FAISS(FAISS):
deduplicated_dict
.
items
()]
deduplicated_dict
.
items
()]
return
deduplicated_documents
return
deduplicated_documents
def
similarity_search_with_score_by_vector
(
#
def similarity_search_with_score_by_vector(
self
,
#
self,
embedding
:
List
[
float
],
#
embedding: List[float],
k
:
int
=
4
,
#
k: int = 4,
filter
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
#
filter: Optional[Dict[str, Any]] = None,
fetch_k
:
int
=
20
,
#
fetch_k: int = 20,
**
kwargs
:
Any
,
#
**kwargs: Any,
)
->
List
[
Tuple
[
Document
,
float
]]:
#
) -> List[Tuple[Document, float]]:
faiss
=
dependable_faiss_import
()
#
faiss = dependable_faiss_import()
vector
=
np
.
array
([
embedding
],
dtype
=
np
.
float32
)
#
vector = np.array([embedding], dtype=np.float32)
if
self
.
_normalize_L2
:
#
if self._normalize_L2:
faiss
.
normalize_L2
(
vector
)
#
faiss.normalize_L2(vector)
scores
,
indices
=
self
.
index
.
search
(
vector
,
k
if
filter
is
None
else
fetch_k
)
#
scores, indices = self.index.search(vector, k if filter is None else fetch_k)
docs
=
[]
#
docs = []
for
j
,
i
in
enumerate
(
indices
[
0
]):
#
for j, i in enumerate(indices[0]):
if
i
==
-
1
:
#
if i == -1:
# This happens when not enough docs are returned.
#
# This happens when not enough docs are returned.
continue
#
continue
_id
=
self
.
index_to_docstore_id
[
i
]
#
_id = self.index_to_docstore_id[i]
doc
=
self
.
docstore
.
search
(
_id
)
#
doc = self.docstore.search(_id)
if
not
isinstance
(
doc
,
Document
):
#
if not isinstance(doc, Document):
raise
ValueError
(
f
"Could not find document for id {_id}, got {doc}"
)
#
raise ValueError(f"Could not find document for id {_id}, got {doc}")
if
filter
is
not
None
:
#
if filter is not None:
filter
=
{
#
filter = {
key
:
[
value
]
if
not
isinstance
(
value
,
list
)
else
value
#
key: [value] if not isinstance(value, list) else value
for
key
,
value
in
filter
.
items
()
#
for key, value in filter.items()
}
#
}
if
all
(
doc
.
metadata
.
get
(
key
)
in
value
for
key
,
value
in
filter
.
items
()):
#
if all(doc.metadata.get(key) in value for key, value in filter.items()):
docs
.
append
((
doc
,
scores
[
0
][
j
]))
#
docs.append((doc, scores[0][j]))
else
:
#
else:
docs
.
append
((
doc
,
scores
[
0
][
j
]))
#
docs.append((doc, scores[0][j]))
docs
=
self
.
_tuple_deduplication
(
docs
)
#
docs = self._tuple_deduplication(docs)
score_threshold
=
kwargs
.
get
(
"score_threshold"
)
#
score_threshold = kwargs.get("score_threshold")
if
score_threshold
is
not
None
:
#
if score_threshold is not None:
cmp
=
(
#
cmp = (
operator
.
ge
#
operator.ge
if
self
.
distance_strategy
#
if self.distance_strategy
in
(
DistanceStrategy
.
MAX_INNER_PRODUCT
,
DistanceStrategy
.
JACCARD
)
#
in (DistanceStrategy.MAX_INNER_PRODUCT, DistanceStrategy.JACCARD)
else
operator
.
le
#
else operator.le
)
#
)
docs
=
[
#
docs = [
(
doc
,
similarity
)
#
(doc, similarity)
for
doc
,
similarity
in
docs
#
for doc, similarity in docs
if
cmp
(
similarity
,
score_threshold
)
#
if cmp(similarity, score_threshold)
]
#
]
#
if
"doc_callback"
in
kwargs
:
#
if "doc_callback" in kwargs:
if
hasattr
(
kwargs
[
"doc_callback"
],
'after_search'
):
#
if hasattr(kwargs["doc_callback"], 'after_search'):
docs
=
kwargs
[
"doc_callback"
]
.
after_search
(
self
.
docstore
,
docs
,
number
=
k
)
#
docs = kwargs["doc_callback"].after_search(self.docstore, docs, number=k)
return
docs
[:
k
]
#
return docs[:k]
def
max_marginal_relevance_search_by_vector
(
def
max_marginal_relevance_search_by_vector
(
self
,
self
,
...
@@ -180,7 +180,7 @@ def getFAISS(embedding_model_name: str, store_path: str, info: dict = None, inde
...
@@ -180,7 +180,7 @@ def getFAISS(embedding_model_name: str, store_path: str, info: dict = None, inde
index_to_docstore_id
=
{})
index_to_docstore_id
=
{})
else
:
else
:
print
(
"load_local faiss"
)
print
(
"load_local faiss"
)
_faiss
=
RE_FAISS
.
load_local
(
folder_path
=
store_path
,
index_name
=
index_name
,
embeddings
=
embeddings
)
_faiss
=
RE_FAISS
.
load_local
(
folder_path
=
store_path
,
index_name
=
index_name
,
embeddings
=
embeddings
,
allow_dangerous_deserialization
=
True
)
if
docstore1
and
is_pgsql
:
# 如果外部参数调整,更新docstore
if
docstore1
and
is_pgsql
:
# 如果外部参数调整,更新docstore
_faiss
.
docstore
=
docstore1
_faiss
.
docstore
=
docstore1
return
_faiss
return
_faiss
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment