Commit 27074f64 by 文靖昊

去除混排,使用重排之后的前三个文档之和作为返回文档

parent 135a0fe7
...@@ -81,44 +81,46 @@ class GetSimilarityWithExt: ...@@ -81,44 +81,46 @@ class GetSimilarityWithExt:
return result return result
def get_rerank_with_doc(self, reranker: BgeRerank,split_docs_list:list): def get_rerank_with_doc(self, reranker: BgeRerank,split_docs_list:list):
top_k = self.get_doc_nums(len(split_docs_list)) # top_k = self.get_doc_nums(len(split_docs_list))
question = '\n'.join(self.question) question = '\n'.join(self.question)
print(question) print(question)
rerank_docs1_hash = [] # rerank_docs1_hash = []
rerank_docs2_hash = [] # rerank_docs2_hash = []
m = {} # m = {}
result = [] result = []
for split_doc in split_docs_list: for split_doc in split_docs_list:
start = time.time() start = time.time()
rerank_docs1 = reranker.compress_documents(split_doc, question) rerank_docs1 = reranker.compress_documents(split_doc, question)
result.extend(rerank_docs1[:3])
end = time.time() end = time.time()
print('重排1 time: %s Seconds' % (end - start)) print('重排1 time: %s Seconds' % (end - start))
for doc in rerank_docs1: # for doc in rerank_docs1:
m[hash(doc.page_content)] = doc # m[hash(doc.page_content)] = doc
rerank_docs1_hash.append(hash(doc.page_content)) # rerank_docs1_hash.append(hash(doc.page_content))
result.append((60, rerank_docs1_hash)) # result.append((60, rerank_docs1_hash))
start = time.time() start = time.time()
rerank_docs2 = reranker.compress_documents(self.similarity_docs, question) rerank_docs2 = reranker.compress_documents(self.similarity_docs, question)
result.extend(rerank_docs2[:3])
end = time.time() end = time.time()
print('重排2 time: %s Seconds' % (end - start)) print('重排2 time: %s Seconds' % (end - start))
for doc in rerank_docs2: # for doc in rerank_docs2:
m[hash(doc.page_content)] = doc # m[hash(doc.page_content)] = doc
rerank_docs2_hash.append(hash(doc.page_content)) # rerank_docs2_hash.append(hash(doc.page_content))
#
result.append((55,rerank_docs2_hash)) # result.append((55,rerank_docs2_hash))
print(len(rerank_docs1_hash)) # print(len(rerank_docs1_hash))
print(len(rerank_docs2_hash)) # print(len(rerank_docs2_hash))
start = time.time() # start = time.time()
rrf_doc = reciprocal_rank_fusion(result) # rrf_doc = reciprocal_rank_fusion(result)
end = time.time() # end = time.time()
print('混排 time: %s Seconds' % (end - start)) # print('混排 time: %s Seconds' % (end - start))
print("混排文档数量:", len(rrf_doc)) # print("混排文档数量:", len(rrf_doc))
d_list = [] # d_list = []
for key in rrf_doc: # for key in rrf_doc:
d_list.append(m[key]) # d_list.append(m[key])
print("返回文档数量:",top_k) # print("返回文档数量:",top_k)
self.rerank_docs = d_list[:top_k] self.rerank_docs = result
return self.join_document(d_list[:top_k]) return self.join_document(result)
def get_similarity_doc(self): def get_similarity_doc(self):
return self.similarity_doc_txt return self.similarity_doc_txt
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment