Commit 4d063424 by 文靖昊

Merge remote-tracking branch 'origin/geo' into geo

parents 824785fb 2c36a162
......@@ -58,4 +58,6 @@ exam/
aaa/
bbb/
ccc/
.env
\ No newline at end of file
.env
lae_pg_data
tmp
\ No newline at end of file
......@@ -30,6 +30,7 @@ CREATE TABLE turn_qa (
chat_id varchar(1000),
question text,
answer text,
similar_docs text,
create_time timestamp(6) DEFAULT current_timestamp,
turn_number int,
is_last int2
......@@ -38,6 +39,7 @@ COMMENT ON COLUMN "turn_qa"."turn_id" IS '会话轮次id';
COMMENT ON COLUMN "turn_qa"."chat_id" IS '会话id';
COMMENT ON COLUMN "turn_qa"."question" IS '该轮会话问题';
COMMENT ON COLUMN "turn_qa"."answer" IS '该轮会话答案';
COMMENT ON COLUMN "turn_qa"."similar_docs" IS '该轮会话相似文档 hash 索引';
COMMENT ON COLUMN "turn_qa"."create_time" IS '该轮会话创建时间,默认为当前时间';
COMMENT ON COLUMN "turn_qa"."turn_number" IS '会话轮数';
COMMENT ON COLUMN "turn_qa"."is_last" IS '是否为最后一轮对话:0=否,1=是';
......
......@@ -5,6 +5,8 @@ from fastapi import FastAPI, Header,Query
from fastapi.middleware.cors import CORSMiddleware
from datetime import datetime,timedelta
from src.pgdb.chat.c_db import UPostgresDB
from src.pgdb.knowledge.k_db import PostgresDB
from src.pgdb.knowledge.txt_doc_table import TxtDoc
import uvicorn
import json
from src.pgdb.chat.crud import CRUD
......@@ -13,6 +15,7 @@ from src.server.get_similarity import QAExt
from src.server.qa import QA
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.documents import Document
import re
from src.controller.request import (
PhoneLoginRequest,
......@@ -49,6 +52,9 @@ c_db = UPostgresDB(host=CHAT_DB_HOST, database=CHAT_DB_DBNAME, user=CHAT_DB_USER
port=CHAT_DB_PORT, )
c_db.connect()
k_db = PostgresDB(host=VEC_DB_HOST, database=VEC_DB_DBNAME, user=VEC_DB_USER, password=VEC_DB_PASSWORD, port=VEC_DB_PORT)
k_db.connect()
vecstore_faiss = VectorStore_FAISS(
embedding_model_name=EMBEEDING_MODEL_PATH,
store_path=FAISS_STORE_PATH,
......@@ -127,7 +133,7 @@ def get_history_by_session_id(session_id:str,token: str = Header(None)):
j["Question"] = h[1]
j["Answer"] = h[2]
j["IsLast"] = h[3]
j["SimilarDocuments"] =[]
j["SimilarDocuments"] = get_similarity_doc(h[4])
history_json.append(j)
history_str = json.dumps(history_json)
return {
......@@ -172,19 +178,26 @@ def question(chat_request: ChatRequest, token: str = Header(None)):
prompt += "问:{}\n答:{}\n\n".format(h[0], h[1])
answer, docs = my_chat.chat_with_history_with_ext(question,ext=matches,history=prompt, with_similarity=True)
docs_json = []
doc_hash = []
for d in docs:
j ={}
j["page_content"] = d.page_content
j["from_file"] = d.metadata["filename"]
j["page_number"] = 0
if "hash" in d.metadata:
doc_hash.append(d.metadata["hash"])
docs_json.append(j)
if len(doc_hash)>0:
hash_str = ",".join(doc_hash)
else:
hash_str = ""
# answer = "test Answer"
if session_id =="":
session_id = crud.create_chat(token, '\t\t', '0')
crud.insert_turn_qa(session_id, question, answer, 0, 1)
crud.insert_turn_qa(session_id, question, answer, 0, 1, hash_str)
else:
last_turn_id = crud.get_last_turn_num(str(session_id))
crud.insert_turn_qa(session_id, question, answer, last_turn_id+1, 1)
crud.insert_turn_qa(session_id, question, answer, last_turn_id+1, 1, hash_str)
return {
'code': 200,
......@@ -217,16 +230,24 @@ def re_generate(chat_request: ReGenerateRequest, token: str = Header(None)):
prompt += "问:{}\n答:{}\n\n".format(h[0], h[1])
answer, docs = my_chat.chat_with_history_with_ext(question,ext=matches, history=prompt, with_similarity=True)
docs_json = []
doc_hash = []
for d in docs:
j = {}
j["page_content"] = d.page_content
j["from_file"] = d.metadata["filename"]
j["page_number"] = 0
docs_json.append(j)
if "hash" in d.metadata:
doc_hash.append(d.metadata["hash"])
if len(doc_hash)>0:
hash_str = ",".join(doc_hash)
else:
hash_str = ""
# answer = "reGenerate Answer"
crud.update_turn_last(str(session_id), last_turn_id )
crud.insert_turn_qa(session_id, question, answer, last_turn_id , 1)
crud.insert_turn_qa(session_id, question, answer, last_turn_id, 1, hash_str)
return {
'code': 200,
......@@ -238,5 +259,28 @@ def re_generate(chat_request: ReGenerateRequest, token: str = Header(None)):
}
}
def get_similarity_doc(similarity_doc_hash: str):
    """Resolve a stored comma-separated hash string into similar-document JSON.

    Args:
        similarity_doc_hash: value of ``turn_qa.similar_docs`` for one turn, i.e.
            paragraph hashes joined with ``","``. May be ``None`` or ``""`` when
            the turn was stored without any similar documents.

    Returns:
        A list of dicts as produced by ``docs_to_json``; an empty list when no
        hash is supplied. Hashes whose lookup returns nothing are skipped.
    """
    if not similarity_doc_hash:
        return []
    # Ignore blank entries (e.g. "a,,b" or a lone ",") instead of querying for "".
    hashes = [h for h in similarity_doc_hash.split(",") if h]
    if not hashes:
        return []
    txt_doc = TxtDoc(k_db)
    docs = []
    for doc_hash in hashes:
        row = txt_doc.search(doc_hash)
        if not row:
            # NOTE(review): presumably the paragraph is no longer in the knowledge
            # base; skip it rather than failing the whole history request.
            continue
        # row[0] is used as the page content and row[1] as JSON-encoded metadata.
        docs.append(Document(page_content=row[0], metadata=json.loads(row[1])))
    return docs_to_json(docs)
def docs_to_json(docs):
    """Convert documents into the plain dicts returned to the client.

    Args:
        docs: iterable of objects exposing ``page_content`` and a ``metadata``
            mapping that contains a ``"filename"`` key.

    Returns:
        One dict per document with the keys ``page_content``, ``from_file`` and
        ``page_number``, in input order.

    Raises:
        KeyError: if a document's metadata has no ``"filename"`` entry.
    """
    return [
        {
            "page_content": doc.page_content,
            "from_file": doc.metadata["filename"],
            # The page number is not looked up here; the field is always 0.
            "page_number": 0,
        }
        for doc in docs
    ]
if __name__ == "__main__":
uvicorn.run(app, host='0.0.0.0', port=8088)
......@@ -32,6 +32,7 @@ CREATE TABLE turn_qa (
chat_id varchar(1000),
question text,
answer text,
similar_docs text,
create_time timestamp(6) DEFAULT current_timestamp,
turn_number int,
is_last int2
......@@ -40,6 +41,7 @@ COMMENT ON COLUMN "turn_qa"."turn_id" IS '会话轮次id';
COMMENT ON COLUMN "turn_qa"."chat_id" IS '会话id';
COMMENT ON COLUMN "turn_qa"."question" IS '该轮会话问题';
COMMENT ON COLUMN "turn_qa"."answer" IS '该轮会话答案';
COMMENT ON COLUMN "turn_qa"."similar_docs" IS '该轮会话相似文档 hash 索引';
COMMENT ON COLUMN "turn_qa"."create_time" IS '该轮会话创建时间,默认为当前时间';
COMMENT ON COLUMN "turn_qa"."turn_number" IS '会话轮数';
COMMENT ON COLUMN "turn_qa"."is_last" IS '是否为最后一轮对话:0=否,1=是';
......@@ -86,26 +88,26 @@ class CRUD:
self.db.execute(TABLE_USER)
def get_history(self, _chat_id):
    """Fetch every stored turn of a chat, ordered by ascending turn number.

    Args:
        _chat_id: chat/session id, passed to the driver as a query parameter.

    Returns:
        Whatever ``self.db.fetchall()`` yields for the query; each row holds
        ``turn_number, question, answer, is_last, similar_docs`` in that order.
    """
    # Single query definition: the superseded column list (without similar_docs)
    # was a dead assignment, and the f-prefix had no placeholders to format.
    query = (
        'SELECT turn_number,question,answer,is_last,similar_docs '
        'FROM turn_qa WHERE chat_id=(%s) ORDER BY turn_number ASC'
    )
    self.db.execute_args(query, (_chat_id,))
    return self.db.fetchall()
def get_last_history(self, _chat_id):
    """Fetch the turns of a chat that are flagged ``is_last=1``, oldest first.

    Args:
        _chat_id: chat/session id, passed to the driver as a query parameter.

    Returns:
        Whatever ``self.db.fetchall()`` yields for the query; each row holds
        ``question, answer, similar_docs`` in that order.
    """
    # Single query definition: the superseded column list (without similar_docs)
    # was a dead assignment, and the f-prefix had no placeholders to format.
    query = (
        'SELECT question,answer,similar_docs FROM turn_qa '
        'WHERE chat_id=(%s) and is_last=1 ORDER BY turn_number ASC'
    )
    self.db.execute_args(query, (_chat_id,))
    return self.db.fetchall()
def get_last_history_before_turn_id(self, _chat_id,turn_id):
    """Fetch the ``is_last=1`` turns of a chat that precede a given turn number.

    Args:
        _chat_id: chat/session id, passed to the driver as a query parameter.
        turn_id: exclusive upper bound compared against ``turn_number``.

    Returns:
        Whatever ``self.db.fetchall()`` yields for the query; each row holds
        ``question, answer, similar_docs`` in that order, oldest turn first.
    """
    # Single query definition: the superseded column list (without similar_docs)
    # was a dead assignment, and the f-prefix had no placeholders to format.
    query = (
        'SELECT question,answer,similar_docs FROM turn_qa '
        'WHERE chat_id=(%s) and is_last=1 and turn_number<(%s) ORDER BY turn_number ASC'
    )
    self.db.execute_args(query, (_chat_id, turn_id))
    return self.db.fetchall()
def insert_turn_qa(self, chat_id, question, answer, turn_number, is_last, similar_docs=None):
    """Insert one question/answer turn into ``turn_qa``.

    Args:
        chat_id: id of the chat the turn belongs to.
        question: question text of the turn.
        answer: answer text of the turn.
        turn_number: position of the turn within the chat.
        is_last: value stored in the ``is_last`` column.
        similar_docs: value stored in the ``similar_docs`` column; callers in
            this code base pass a comma-separated string of document hashes.
            Defaults to ``None`` so five-argument callers keep working.
    """
    # Only this definition is kept: an earlier five-parameter def with the same
    # name was shadowed by it and never reachable.
    query = (
        'INSERT INTO turn_qa(chat_id, question, answer, turn_number, is_last, similar_docs) '
        'VALUES (%s,%s,%s,%s,%s,%s)'
    )
    self.db.execute_args(query, (chat_id, question, answer, turn_number, is_last, similar_docs))
......
......@@ -5,6 +5,7 @@ CREATE TABLE turn_qa (
chat_id varchar(1000),
question text,
answer text,
similar_docs text,
create_time timestamp(6) DEFAULT current_timestamp,
turn_number int,
is_last int2
......@@ -13,6 +14,7 @@ COMMENT ON COLUMN "turn_qa"."turn_id" IS '会话轮次id';
COMMENT ON COLUMN "turn_qa"."chat_id" IS '会话id';
COMMENT ON COLUMN "turn_qa"."question" IS '该轮会话问题';
COMMENT ON COLUMN "turn_qa"."answer" IS '该轮会话答案';
COMMENT ON COLUMN "turn_qa"."similar_docs" IS '该轮会话相似文档 hash 索引';
COMMENT ON COLUMN "turn_qa"."create_time" IS '该轮会话创建时间,默认为当前时间';
COMMENT ON COLUMN "turn_qa"."turn_number" IS '会话轮数';
COMMENT ON COLUMN "turn_qa"."is_last" IS '是否为最后一轮对话:0=否,1=是';
......
......@@ -64,8 +64,10 @@ class PgSqlDocstore(Docstore, AddableMixin):
self.__sub_init__()
anwser = self.VEC_TXT.search(search)
content = self.TXT_DOC.search(anwser[0])
meta = json.loads(content[1])
meta.update({"hash": anwser[0]}) # paragraph_id = hash 插入到metadata中,便于后续根据段落查找
if content:
return Document(page_content=content[0], metadata=json.loads(content[1]))
return Document(page_content=content[0], metadata=meta)
else:
return Document()
......
......@@ -81,4 +81,1563 @@ class Base(ABC):
pass
def similarity(self, query: str, texts: list):
    """Placeholder for scoring *texts* against *query*; always raises here.

    Args:
        query: the query string.
        texts: the candidate texts.

    Raises:
        NotImplementedError: always; there is no implementation in this method.
    """
    # The message previously named "encode", which misdirects whoever hits
    # this error from a similarity() call.
    raise NotImplementedError("Please implement similarity method!")
# https://github.com/kzhisa/rag-fusion/blob/main/rag_fusion.py
from langchain.load import dumps, loads
MAX_DOCS_FOR_CONTEXT = 8
def reciprocal_rank_fusion(results: list[tuple]):
    """Rerank retrieved documents with Reciprocal Rank Fusion (RRF).

    Args:
        results: ``(k, docs)`` pairs. ``k`` is the RRF constant applied to that
            list and ``docs`` its documents, assumed to be sorted by descending
            relevance.

    Returns:
        At most ``MAX_DOCS_FOR_CONTEXT`` documents, ordered by descending fused
        score. A document that occurs in several lists is returned once.

    Raises:
        ZeroDivisionError: if a list uses ``k == 0``, because ranks start at 0.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Documents are keyed by their serialized form so that the same document
    # coming from different lists accumulates into a single score.
    fused_scores = {}
    for k, docs in results:
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)

    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    # Deserialize only the entries that are actually returned.
    top_results = [
        (loads(doc_str), score)
        for doc_str, score in ranked[:MAX_DOCS_FOR_CONTEXT]
    ]

    # Diagnostics go through logging instead of unconditional print() calls.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("Reranked documents: %d", len(ranked))
        for doc, score in top_results:
            logger.debug(
                "Docs: %s | RRF score: %s",
                ' '.join(doc.page_content[:100].split()),
                score,
            )

    # Return only the documents, without their scores.
    return [doc for doc, _ in top_results]
# Word segmentation with jieba
from jieba import cut
stopWords = [
'--',
'?',
'“',
'”',
'》',
'--',
'able',
'about',
'above',
'according',
'accordingly',
'across',
'actually',
'after',
'afterwards',
'again',
'against',
"ain't",
'all',
'allow',
'allows',
'almost',
'alone',
'along',
'already',
'also',
'although',
'always',
'am',
'among',
'amongst',
'an',
'and',
'another',
'any',
'anybody',
'anyhow',
'anyone',
'anything',
'anyway',
'anyways',
'anywhere',
'apart',
'appear',
'appreciate',
'appropriate',
'are',
"aren't",
'around',
'as',
"a's",
'aside',
'ask',
'asking',
'associated',
'at',
'available',
'away',
'awfully',
'be',
'became',
'because',
'become',
'becomes',
'becoming',
'been',
'before',
'beforehand',
'behind',
'being',
'believe',
'below',
'beside',
'besides',
'best',
'better',
'between',
'beyond',
'both',
'brief',
'but',
'by',
'came',
'can',
'cannot',
'cant',
"can't",
'cause',
'causes',
'certain',
'certainly',
'changes',
'clearly',
"c'mon",
'co',
'com',
'come',
'comes',
'concerning',
'consequently',
'consider',
'considering',
'contain',
'containing',
'contains',
'corresponding',
'could',
"couldn't",
'course',
"c's",
'currently',
'definitely',
'described',
'despite',
'did',
"didn't",
'different',
'do',
'does',
"doesn't",
'doing',
'done',
"don't",
'down',
'downwards',
'during',
'each',
'edu',
'eg',
'eight',
'either',
'else',
'elsewhere',
'enough',
'entirely',
'especially',
'et',
'etc',
'even',
'ever',
'every',
'everybody',
'everyone',
'everything',
'everywhere',
'ex',
'exactly',
'example',
'except',
'far',
'few',
'fifth',
'first',
'five',
'followed',
'following',
'follows',
'for',
'former',
'formerly',
'forth',
'four',
'from',
'further',
'furthermore',
'get',
'gets',
'getting',
'given',
'gives',
'go',
'goes',
'going',
'gone',
'got',
'gotten',
'greetings',
'had',
"hadn't",
'happens',
'hardly',
'has',
"hasn't",
'have',
"haven't",
'having',
'he',
'hello',
'help',
'hence',
'her',
'here',
'hereafter',
'hereby',
'herein',
"here's",
'hereupon',
'hers',
'herself',
"he's",
'hi',
'him',
'himself',
'his',
'hither',
'hopefully',
'how',
'howbeit',
'however',
"i'd",
'ie',
'if',
'ignored',
"i'll",
"i'm",
'immediate',
'in',
'inasmuch',
'inc',
'indeed',
'indicate',
'indicated',
'indicates',
'inner',
'insofar',
'instead',
'into',
'inward',
'is',
"isn't",
'it',
"it'd",
"it'll",
'its',
"it's",
'itself',
"i've",
'just',
'keep',
'keeps',
'kept',
'know',
'known',
'knows',
'last',
'lately',
'later',
'latter',
'latterly',
'least',
'less',
'lest',
'let',
"let's",
'like',
'liked',
'likely',
'little',
'look',
'looking',
'looks',
'ltd',
'mainly',
'many',
'may',
'maybe',
'me',
'mean',
'meanwhile',
'merely',
'might',
'more',
'moreover',
'most',
'mostly',
'much',
'must',
'my',
'myself',
'name',
'namely',
'nd',
'near',
'nearly',
'necessary',
'need',
'needs',
'neither',
'never',
'nevertheless',
'new',
'next',
'nine',
'no',
'nobody',
'non',
'none',
'noone',
'nor',
'normally',
'not',
'nothing',
'novel',
'now',
'nowhere',
'obviously',
'of',
'off',
'often',
'oh',
'ok',
'okay',
'old',
'on',
'once',
'one',
'ones',
'only',
'onto',
'or',
'other',
'others',
'otherwise',
'ought',
'our',
'ours',
'ourselves',
'out',
'outside',
'over',
'overall',
'own',
'particular',
'particularly',
'per',
'perhaps',
'placed',
'please',
'plus',
'possible',
'presumably',
'probably',
'provides',
'que',
'quite',
'qv',
'rather',
'rd',
're',
'really',
'reasonably',
'regarding',
'regardless',
'regards',
'relatively',
'respectively',
'right',
'said',
'same',
'saw',
'say',
'saying',
'says',
'second',
'secondly',
'see',
'seeing',
'seem',
'seemed',
'seeming',
'seems',
'seen',
'self',
'selves',
'sensible',
'sent',
'serious',
'seriously',
'seven',
'several',
'shall',
'she',
'should',
"shouldn't",
'since',
'six',
'so',
'some',
'somebody',
'somehow',
'someone',
'something',
'sometime',
'sometimes',
'somewhat',
'somewhere',
'soon',
'sorry',
'specified',
'specify',
'specifying',
'still',
'sub',
'such',
'sup',
'sure',
'take',
'taken',
'tell',
'tends',
'th',
'than',
'thank',
'thanks',
'thanx',
'that',
'thats',
"that's",
'the',
'their',
'theirs',
'them',
'themselves',
'then',
'thence',
'there',
'thereafter',
'thereby',
'therefore',
'therein',
'theres',
"there's",
'thereupon',
'these',
'they',
"they'd",
"they'll",
"they're",
"they've",
'think',
'third',
'this',
'thorough',
'thoroughly',
'those',
'though',
'three',
'through',
'throughout',
'thru',
'thus',
'to',
'together',
'too',
'took',
'toward',
'towards',
'tried',
'tries',
'truly',
'try',
'trying',
"t's",
'twice',
'two',
'un',
'under',
'unfortunately',
'unless',
'unlikely',
'until',
'unto',
'up',
'upon',
'us',
'use',
'used',
'useful',
'uses',
'using',
'usually',
'value',
'various',
'very',
'via',
'viz',
'vs',
'want',
'wants',
'was',
"wasn't",
'way',
'we',
"we'd",
'welcome',
'well',
"we'll",
'went',
'were',
"we're",
"weren't",
"we've",
'what',
'whatever',
"what's",
'when',
'whence',
'whenever',
'where',
'whereafter',
'whereas',
'whereby',
'wherein',
"where's",
'whereupon',
'wherever',
'whether',
'which',
'while',
'whither',
'who',
'whoever',
'whole',
'whom',
"who's",
'whose',
'why',
'will',
'willing',
'wish',
'with',
'within',
'without',
'wonder',
"won't",
'would',
"wouldn't",
'yes',
'yet',
'you',
"you'd",
"you'll",
'your',
"you're",
'yours',
'yourself',
'yourselves',
"you've",
'zero',
'zt',
'ZT',
'zz',
'ZZ',
'一',
'一下',
'一些',
'一切',
'一则',
'一天',
'一定',
'一方面',
'一旦',
'一时',
'一来',
'一样',
'一次',
'一片',
'一直',
'一致',
'一般',
'一起',
'一边',
'一面',
'万一',
'上下',
'上升',
'上去',
'上来',
'上述',
'上面',
'下列',
'下去',
'下来',
'下面',
'不一',
'不久',
'不仅',
'不会',
'不但',
'不光',
'不单',
'不变',
'不只',
'不可',
'不同',
'不够',
'不如',
'不得',
'不怕',
'不惟',
'不成',
'不拘',
'不敢',
'不断',
'不是',
'不比',
'不然',
'不特',
'不独',
'不管',
'不能',
'不要',
'不论',
'不足',
'不过',
'不问',
'与',
'与其',
'与否',
'与此同时',
'专门',
'且',
'两者',
'严格',
'严重',
'个',
'个人',
'个别',
'中小',
'中间',
'丰富',
'临',
'为',
'为主',
'为了',
'为什么',
'为什麽',
'为何',
'为着',
'主张',
'主要',
'举行',
'乃',
'乃至',
'么',
'之',
'之一',
'之前',
'之后',
'之後',
'之所以',
'之类',
'乌乎',
'乎',
'乘',
'也',
'也好',
'也是',
'也罢',
'了',
'了解',
'争取',
'于',
'于是',
'于是乎',
'云云',
'互相',
'产生',
'人们',
'人家',
'什么',
'什么样',
'什麽',
'今后',
'今天',
'今年',
'今後',
'仍然',
'从',
'从事',
'从而',
'他',
'他人',
'他们',
'他的',
'代替',
'以',
'以上',
'以下',
'以为',
'以便',
'以免',
'以前',
'以及',
'以后',
'以外',
'以後',
'以来',
'以至',
'以至于',
'以致',
'们',
'任',
'任何',
'任凭',
'任务',
'企图',
'伟大',
'似乎',
'似的',
'但',
'但是',
'何',
'何况',
'何处',
'何时',
'作为',
'你',
'你们',
'你的',
'使得',
'使用',
'例如',
'依',
'依照',
'依靠',
'促进',
'保持',
'俺',
'俺们',
'倘',
'倘使',
'倘或',
'倘然',
'倘若',
'假使',
'假如',
'假若',
'做到',
'像',
'允许',
'充分',
'先后',
'先後',
'先生',
'全部',
'全面',
'兮',
'共同',
'关于',
'其',
'其一',
'其中',
'其二',
'其他',
'其余',
'其它',
'其实',
'其次',
'具体',
'具体地说',
'具体说来',
'具有',
'再者',
'再说',
'冒',
'冲',
'决定',
'况且',
'准备',
'几',
'几乎',
'几时',
'凭',
'凭借',
'出去',
'出来',
'出现',
'分别',
'则',
'别',
'别的',
'别说',
'到',
'前后',
'前者',
'前进',
'前面',
'加之',
'加以',
'加入',
'加强',
'十分',
'即',
'即令',
'即使',
'即便',
'即或',
'即若',
'却不',
'原来',
'又',
'及',
'及其',
'及时',
'及至',
'双方',
'反之',
'反应',
'反映',
'反过来',
'反过来说',
'取得',
'受到',
'变成',
'另',
'另一方面',
'另外',
'只是',
'只有',
'只要',
'只限',
'叫',
'叫做',
'召开',
'叮咚',
'可',
'可以',
'可是',
'可能',
'可见',
'各',
'各个',
'各人',
'各位',
'各地',
'各种',
'各级',
'各自',
'合理',
'同',
'同一',
'同时',
'同样',
'后来',
'后面',
'向',
'向着',
'吓',
'吗',
'否则',
'吧',
'吧哒',
'吱',
'呀',
'呃',
'呕',
'呗',
'呜',
'呜呼',
'呢',
'周围',
'呵',
'呸',
'呼哧',
'咋',
'和',
'咚',
'咦',
'咱',
'咱们',
'咳',
'哇',
'哈',
'哈哈',
'哉',
'哎',
'哎呀',
'哎哟',
'哗',
'哟',
'哦',
'哩',
'哪',
'哪个',
'哪些',
'哪儿',
'哪天',
'哪年',
'哪怕',
'哪样',
'哪边',
'哪里',
'哼',
'哼唷',
'唉',
'啊',
'啐',
'啥',
'啦',
'啪达',
'喂',
'喏',
'喔唷',
'嗡嗡',
'嗬',
'嗯',
'嗳',
'嘎',
'嘎登',
'嘘',
'嘛',
'嘻',
'嘿',
'因',
'因为',
'因此',
'因而',
'固然',
'在',
'在下',
'地',
'坚决',
'坚持',
'基本',
'处理',
'复杂',
'多',
'多少',
'多数',
'多次',
'大力',
'大多数',
'大大',
'大家',
'大批',
'大约',
'大量',
'失去',
'她',
'她们',
'她的',
'好的',
'好象',
'如',
'如上所述',
'如下',
'如何',
'如其',
'如果',
'如此',
'如若',
'存在',
'宁',
'宁可',
'宁愿',
'宁肯',
'它',
'它们',
'它们的',
'它的',
'安全',
'完全',
'完成',
'实现',
'实际',
'宣布',
'容易',
'密切',
'对',
'对于',
'对应',
'将',
'少数',
'尔后',
'尚且',
'尤其',
'就',
'就是',
'就是说',
'尽',
'尽管',
'属于',
'岂但',
'左右',
'巨大',
'巩固',
'己',
'已经',
'帮助',
'常常',
'并',
'并不',
'并不是',
'并且',
'并没有',
'广大',
'广泛',
'应当',
'应用',
'应该',
'开外',
'开始',
'开展',
'引起',
'强烈',
'强调',
'归',
'当',
'当前',
'当时',
'当然',
'当着',
'形成',
'彻底',
'彼',
'彼此',
'往',
'往往',
'待',
'後来',
'後面',
'得',
'得出',
'得到',
'心里',
'必然',
'必要',
'必须',
'怎',
'怎么',
'怎么办',
'怎么样',
'怎样',
'怎麽',
'总之',
'总是',
'总的来看',
'总的来说',
'总的说来',
'总结',
'总而言之',
'恰恰相反',
'您',
'意思',
'愿意',
'慢说',
'成为',
'我',
'我们',
'我的',
'或',
'或是',
'或者',
'战斗',
'所',
'所以',
'所有',
'所谓',
'打',
'扩大',
'把',
'抑或',
'拿',
'按',
'按照',
'换句话说',
'换言之',
'据',
'掌握',
'接着',
'接著',
'故',
'故此',
'整个',
'方便',
'方面',
'旁人',
'无宁',
'无法',
'无论',
'既',
'既是',
'既然',
'时候',
'明显',
'明确',
'是',
'是否',
'是的',
'显然',
'显著',
'普通',
'普遍',
'更加',
'曾经',
'替',
'最后',
'最大',
'最好',
'最後',
'最近',
'最高',
'有',
'有些',
'有关',
'有利',
'有力',
'有所',
'有效',
'有时',
'有点',
'有的',
'有着',
'有著',
'望',
'朝',
'朝着',
'本',
'本着',
'来',
'来着',
'极了',
'构成',
'果然',
'果真',
'某',
'某个',
'某些',
'根据',
'根本',
'欢迎',
'正在',
'正如',
'正常',
'此',
'此外',
'此时',
'此间',
'毋宁',
'每',
'每个',
'每天',
'每年',
'每当',
'比',
'比如',
'比方',
'比较',
'毫不',
'没有',
'沿',
'沿着',
'注意',
'深入',
'清楚',
'满足',
'漫说',
'焉',
'然则',
'然后',
'然後',
'然而',
'照',
'照着',
'特别是',
'特殊',
'特点',
'现代',
'现在',
'甚么',
'甚而',
'甚至',
'用',
'由',
'由于',
'由此可见',
'的',
'的话',
'目前',
'直到',
'直接',
'相似',
'相信',
'相反',
'相同',
'相对',
'相对而言',
'相应',
'相当',
'相等',
'省得',
'看出',
'看到',
'看来',
'看看',
'看见',
'真是',
'真正',
'着',
'着呢',
'矣',
'知道',
'确定',
'离',
'积极',
'移动',
'突出',
'突然',
'立即',
'第',
'等',
'等等',
'管',
'紧接着',
'纵',
'纵令',
'纵使',
'纵然',
'练习',
'组成',
'经',
'经常',
'经过',
'结合',
'结果',
'给',
'绝对',
'继续',
'继而',
'维持',
'综上所述',
'罢了',
'考虑',
'者',
'而',
'而且',
'而况',
'而外',
'而已',
'而是',
'而言',
'联系',
'能',
'能否',
'能够',
'腾',
'自',
'自个儿',
'自从',
'自各儿',
'自家',
'自己',
'自身',
'至',
'至于',
'良好',
'若',
'若是',
'若非',
'范围',
'莫若',
'获得',
'虽',
'虽则',
'虽然',
'虽说',
'行为',
'行动',
'表明',
'表示',
'被',
'要',
'要不',
'要不是',
'要不然',
'要么',
'要是',
'要求',
'规定',
'觉得',
'认为',
'认真',
'认识',
'让',
'许多',
'论',
'设使',
'设若',
'该',
'说明',
'诸位',
'谁',
'谁知',
'赶',
'起',
'起来',
'起见',
'趁',
'趁着',
'越是',
'跟',
'转动',
'转变',
'转贴',
'较',
'较之',
'边',
'达到',
'迅速',
'过',
'过去',
'过来',
'运用',
'还是',
'还有',
'这',
'这个',
'这么',
'这么些',
'这么样',
'这么点儿',
'这些',
'这会儿',
'这儿',
'这就是说',
'这时',
'这样',
'这点',
'这种',
'这边',
'这里',
'这麽',
'进入',
'进步',
'进而',
'进行',
'连',
'连同',
'适应',
'适当',
'适用',
'逐步',
'逐渐',
'通常',
'通过',
'造成',
'遇到',
'遭到',
'避免',
'那',
'那个',
'那么',
'那么些',
'那么样',
'那些',
'那会儿',
'那儿',
'那时',
'那样',
'那边',
'那里',
'那麽',
'部分',
'鄙人',
'采取',
'里面',
'重大',
'重新',
'重要',
'鉴于',
'问题',
'防止',
'阿',
'附近',
'限制',
'除',
'除了',
'除此之外',
'除非',
'随',
'随着',
'随著',
'集中',
'需要',
'非但',
'非常',
'非徒',
'靠',
'顺',
'顺着',
'首先',
'高兴',
'是不是',
'说说',
' ',
'about',
'after',
'all',
'also',
'am',
'an',
'and',
'another',
'any',
'are',
'as',
'at',
'be',
'because',
'been',
'before',
'being',
'between',
'both',
'but',
'by',
'came',
'can',
'come',
'could',
'did',
'do',
'each',
'for',
'from',
'get',
'got',
'has',
'had',
'he',
'have',
'her',
'here',
'him',
'himself',
'his',
'how',
'if',
'in',
'into',
'is',
'it',
'like',
'make',
'many',
'me',
'might',
'more',
'most',
'much',
'must',
'my',
'never',
'now',
'of',
'on',
'only',
'or',
'other',
'our',
'out',
'over',
'said',
'same',
'should',
'since',
'some',
'still',
'such',
'take',
'than',
'that',
'the',
'their',
'them',
'then',
'there',
'these',
'they',
'this',
'those',
'through',
'to',
'too',
'under',
'up',
'very',
'was',
'way',
'we',
'well',
'were',
'what',
'where',
'which',
'while',
'who',
'with',
'would',
'you',
'your',
'a',
'i'
]
def jieba_split(text: str) -> list[str]:
    """Split *text* into tokens with jieba, dropping stop words and punctuation.

    Characters in the ranges U+3000-U+303F and U+FF00-U+FFEF are removed from
    each token, surrounding whitespace is stripped, and tokens that are empty or
    listed in ``stopWords`` are discarded.

    Args:
        text: the string to tokenize.

    Returns:
        The remaining tokens in order, or ``[text]`` when nothing remains.
    """
    import re

    tokens = []
    # NOTE(review): the second positional argument is presumably jieba's
    # ``cut_all`` flag (full mode) - confirm against the jieba version in use.
    for item in cut(text, True):
        if not item or item in stopWords:
            continue
        # str.replace() treats its argument literally, so the character class
        # never matched; re.sub() applies it as the intended pattern.
        cleaned = re.sub(r'[\u3000-\u303f\uff00-\uffef]', '', item).strip()
        if cleaned and cleaned not in stopWords:
            tokens.append(cleaned)
    # A generator object is always truthy, so the fallback to the raw text
    # could never trigger; a materialized list makes it effective.
    return tokens or [text]
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment