Commit 27074f64 by 文靖昊

去除混排,使用重排之后的前三个文档之和作为返回文档

parent 135a0fe7
......@@ -81,44 +81,46 @@ class GetSimilarityWithExt:
return result
def get_rerank_with_doc(self, reranker: "BgeRerank", split_docs_list: list, top_n: int = 3):
    """Rerank every candidate list and return the joined top documents.

    Each list in ``split_docs_list`` and ``self.similarity_docs`` is passed
    to ``reranker.compress_documents`` together with the question text. The
    first ``top_n`` documents of every reranked list are concatenated in
    call order: split lists first, ``self.similarity_docs`` last. No
    reciprocal-rank fusion or de-duplication is applied, so a document that
    ranks highly in several lists appears more than once.

    Args:
        reranker: Object exposing ``compress_documents(documents, query)``.
            Its result is assumed to be ordered best-first (TODO confirm
            against the ``BgeRerank`` implementation).
        split_docs_list: List of document lists to rerank one by one.
        top_n: How many leading documents to keep from each reranked list.
            Defaults to 3, the value that was previously hard-coded.

    Returns:
        Whatever ``self.join_document`` produces for the collected documents.

    Side effects:
        Stores the collected documents in ``self.rerank_docs`` and prints
        the question and the time spent in each rerank call.
    """
    # The question parts are joined into a single query string.
    question = '\n'.join(self.question)
    print(question)

    result = []
    for split_doc in split_docs_list:
        start = time.time()
        rerank_docs1 = reranker.compress_documents(split_doc, question)
        end = time.time()
        print('重排1 time: %s Seconds' % (end - start))
        result.extend(rerank_docs1[:top_n])

    start = time.time()
    rerank_docs2 = reranker.compress_documents(self.similarity_docs, question)
    end = time.time()
    print('重排2 time: %s Seconds' % (end - start))
    result.extend(rerank_docs2[:top_n])

    self.rerank_docs = result
    return self.join_document(result)
def get_similarity_doc(self):
    """Return the value stored in ``self.similarity_doc_txt``."""
    return self.similarity_doc_txt
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment