LAE / Commits / d1dd3af6

Commit d1dd3af6, authored a year ago by 陈正乐
Use the model to implement text segmentation (调用模型实现文本分割)
Parent: 7891dc93

Showing 4 changed files with 99 additions and 35 deletions (+99, -35)
src/config/consts.py                +5   -0
src/pgdb/knowledge/similarity.py    +15  -10
test/file_load_test.py              +19  -0
test/lk_test.py                     +60  -25
src/config/consts.py
@@ -57,3 +57,8 @@ prompt1 = """'''
'''
请你根据上述已知资料回答下面的问题,问题如下:
{question}"""

# =============================
# NLP_BERT model path configuration
# =============================
NLP_BERT_PATH = 'C:\\Users\\15663\\AI\\models\\nlp_bert_document-segmentation_chinese-base'
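Editor's note: the committed path only resolves on the author's Windows machine. A minimal sketch of a portable fallback, assuming an NLP_BERT_PATH environment variable that is not part of this commit:

import os

# Hypothetical override: honor an NLP_BERT_PATH env var when set,
# otherwise fall back to the path committed above (assumption, not in this commit).
NLP_BERT_PATH = os.environ.get(
    'NLP_BERT_PATH',
    'C:\\Users\\15663\\AI\\models\\nlp_bert_document-segmentation_chinese-base')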
src/pgdb/knowledge/similarity.py
import os
import sys
import re
from os import path
import copy
...
@@ -14,7 +13,6 @@ from langchain.embeddings.huggingface import (
)
import math
import faiss
from langchain.vectorstores.utils import DistanceStrategy
from langchain.vectorstores.base import VectorStoreRetriever
from langchain.callbacks.manager import (
    AsyncCallbackManagerForRetrieverRun,
...
@@ -23,9 +21,9 @@ from langchain.callbacks.manager import (
from src.loader import load
from langchain.embeddings.base import Embeddings
from src.pgdb.knowledge.callback import DocumentCallback, DefaultDocumentCallback
import operator
from langchain.vectorstores.utils import DistanceStrategy
import numpy as np
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from src.config.consts import NLP_BERT_PATH

sys.path.append("../")
...
@@ -55,9 +53,6 @@ def get_embding(_path: str) -> Embeddings:
    return EmbeddingFactory(_path).get_embedding()


class RE_FAISS(FAISS):
    # Deduplicate while keeping metadata
    @staticmethod
...
@@ -180,7 +175,8 @@ def getFAISS(embedding_model_name: str, store_path: str, info: dict = None, inde
                         index_to_docstore_id={})
    else:
        print("load_local faiss")
        _faiss = RE_FAISS.load_local(folder_path=store_path, index_name=index_name,
                                     embeddings=embeddings, allow_dangerous_deserialization=True)
    if docstore1 and is_pgsql:
        # Update the docstore when it is adjusted from outside
        _faiss.docstore = docstore1
    return _faiss
...
@@ -257,8 +253,14 @@ class VectorStore_FAISS(FAISS):
        if self.doc_callback:
            new_docs = self.doc_callback.before_store(self._faiss.docstore, new_docs)
        if need_split:
            p = pipeline(task=Tasks.document_segmentation, model=NLP_BERT_PATH, model_revision='v1.0.1')
            for doc in new_docs:
                # words_list = re.split(pattern, doc.page_content)
                words_list = p(documents=doc.page_content)['text'].split('\n\t')
                # Drop duplicate segments
                words_list = set(words_list)
                words_list = [str(words) for words in words_list]
...
@@ -269,6 +271,9 @@ class VectorStore_FAISS(FAISS):
                list_of_documents.append(Document(page_content=words, metadata=metadata))
        else:
            list_of_documents = new_docs
        print("====================================================================================")
        print(list_of_documents)
        print("====================================================================================")
        self._faiss.add_documents(list_of_documents)

    def _add_documents_from_dir(self, filepaths=None, load_kwargs=None):
...
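Editor's note on the hunk above: set(words_list) removes duplicate segments but also scrambles their original order, so chunks are stored out of document order. A minimal order-preserving alternative (a sketch, not part of this commit):

def dedup_keep_order(segments):
    # dict preserves insertion order in Python 3.7+, so this keeps the
    # first occurrence of each segment in its original position.
    return list(dict.fromkeys(segments))

print(dedup_keep_order(['a', 'b', 'a', 'c']))  # ['a', 'b', 'c']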
test/file_load_test.py
0 → 100644
# -*- coding: utf-8 -*-
# Test the model's document-segmentation quality
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

p = pipeline(
    task=Tasks.document_segmentation,
    model="C:\\Users\\15663\\AI\\models\\nlp_bert_document-segmentation_chinese-base",
    model_revision='v1.0.1')

result = p(documents='移动端语音唤醒模型,检测关键词为“小云小云”。模型主体为4层FSMN结构,使用CTC训练准则,参数量750K,适用于移动端设备运行。模型输入为Fbank特征,输出为基于char建模的中文全集token预测,测试工具根据每一帧的预测数据进行后处理得到输入音频的实时检测结果。模型训练采用“basetrain + finetune”的模式,basetrain过程使用大量内部移动端数据,在此基础上,使用1万条设备端录制安静场景“小云小云”数据进行微调,得到最终面向业务的模型。后续用户可在basetrain模型基础上,使用其他关键词数据进行微调,得到新的语音唤醒模型,但暂时未开放模型finetune功能。')

print(result['text'].split('\n'))
print(type([OutputKeys.TEXT]), len(result[OutputKeys.TEXT]))

# Model download:
# from modelscope import snapshot_download
# model_dir = snapshot_download('iic/nlp_bert_document-segmentation_chinese-base')
\ No newline at end of file
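Editor's note: this test splits the pipeline output on '\n', while similarity.py splits on '\n\t'. If the separator the model emits is uncertain, a hedged sketch that tolerates both and drops empty segments:

import re

text = 'first segment\n\tsecond segment\nthird segment'  # stand-in for result['text']
# Split on a newline optionally followed by a tab, then strip blanks.
segments = [seg.strip() for seg in re.split(r'\n\t?', text) if seg.strip()]
print(segments)  # ['first segment', 'second segment', 'third segment']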
test/lk_test.py
class Solution:
    @staticmethod
    def numDecodings(s: str) -> int:
        length = len(s)
        ans = [0 for i in range(length + 1)]
        ans[0] = 0
        ans[1] = 1
        for i in range(1, length + 1):
            print(i)
            if s[i - 1] + s[i] in ('10', '11', '12', '13', '14', '15', '16', '17', '18',
                                   '19', '20', '21', '22', '23', '24', '25', '26'):
                if s[i] == '0':
                    ans[i] = ans[i - 1] + 1
                else:
                    ans[i] = ans[i - 1] + 2
            else:
                ans[i] = ans[i - 1] + 1
        print(ans)
        return ans[length - 1]


Solution.numDecodings("226")
\ No newline at end of file
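Editor's note: as committed, the loop reads s[i] while i runs up to len(s), so Solution.numDecodings("226") raises IndexError before returning, and the +1/+2 counting does not match the decode-ways recurrence the function name suggests. A standard DP sketch for LeetCode 91 (a replacement sketch, not the author's code):

def num_decodings(s: str) -> int:
    # ans[i] = number of ways to decode the prefix s[:i].
    length = len(s)
    ans = [0] * (length + 1)
    ans[0] = 1
    for i in range(1, length + 1):
        if s[i - 1] != '0':                        # decode one digit (1-9)
            ans[i] += ans[i - 1]
        if i >= 2 and '10' <= s[i - 2:i] <= '26':  # decode two digits (10-26)
            ans[i] += ans[i - 2]
    return ans[length]

print(num_decodings("226"))  # 3: "2 2 6", "22 6", "2 26"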
"""给定三个字符串 s1、s2、s3,请你帮忙验证 s3 是否是由 s1 和 s2 交错 组成的。
两个字符串 s 和 t 交错 的定义与过程如下,其中每个字符串都会被分割成若干 非空
子字符串
:
s = s1 + s2 + ... + sn
t = t1 + t2 + ... + tm
|n - m| <= 1
交错 是 s1 + t1 + s2 + t2 + s3 + t3 + ... 或者 t1 + s1 + t2 + s2 + t3 + s3 + ...
注意:a + b 意味着字符串 a 和 b 连接。
示例 1:
输入:s1 = "aabcc", s2 = "dbbca", s3 = "aadbbcbcac"
输出:true
示例 2:
输入:s1 = "aabcc", s2 = "dbbca", s3 = "aadbbbaccc"
输出:false
示例 3:
输入:s1 = "", s2 = "", s3 = ""
输出:true
提示:
0 <= s1.length, s2.length <= 100
0 <= s3.length <= 200
s1、s2、和 s3 都由小写英文字母组成"""
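Editor's note: the problem statement above (LeetCode 97) has no accompanying implementation in this commit. A standard dynamic-programming sketch (mine, not the author's):

def isInterleave(s1: str, s2: str, s3: str) -> bool:
    # dp[i][j]: can s1[:i] and s2[:j] interleave to form s3[:i + j]?
    n, m = len(s1), len(s2)
    if n + m != len(s3):
        return False
    dp = [[False] * (m + 1) for _ in range(n + 1)]
    dp[0][0] = True
    for i in range(n + 1):
        for j in range(m + 1):
            if i > 0 and dp[i - 1][j] and s1[i - 1] == s3[i + j - 1]:
                dp[i][j] = True
            if j > 0 and dp[i][j - 1] and s2[j - 1] == s3[i + j - 1]:
                dp[i][j] = True
    return dp[n][m]

print(isInterleave("aabcc", "dbbca", "aadbbcbcac"))  # True
print(isInterleave("aabcc", "dbbca", "aadbbbaccc"))  # False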
# Definition for a binary tree node.
class TreeNode(object):
    def __init__(self, val=0, left=None, right=None):
        self.val = val
        self.left = left
        self.right = right


class Solution(object):
    def isValidBST(self, root):
        """
        :type root: TreeNode
        :rtype: bool
        """
        if root is None:
            return True
        if root.left is None and root.right is None:
            return True
        if root.left is None:
            return self.isValidBST(root.right) and root.val < root.right.val
        if root.right is None:
            return self.isValidBST(root.left) and root.val > root.left.val
        return self.isValidBST(root.left) and self.isValidBST(root.right) and root.val > root.left.val and root.val < root.right.val
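Editor's note: the version above only compares each node with its immediate children, so it wrongly accepts trees like [5, 4, 6, null, null, 3, 7], where the 3 deep in the right subtree violates the root. A bounds-passing sketch that enforces the BST property globally (a replacement sketch, not the committed code; SolutionFixed is a hypothetical name):

class SolutionFixed(object):
    def isValidBST(self, root):
        # Every node must lie strictly between the bounds inherited from
        # its ancestors, not just between its direct children.
        def valid(node, low, high):
            if node is None:
                return True
            if not (low < node.val < high):
                return False
            return (valid(node.left, low, node.val) and
                    valid(node.right, node.val, high))
        return valid(root, float('-inf'), float('inf'))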