Spaces:
Runtime error
Runtime error
import os
import shutil

import rank_bm25
from rank_bm25 import BM25Okapi
def _reset_folder(folder_path):
    """Make ``folder_path`` an existing directory with no entries left in it.

    Creates the directory when it is missing; otherwise removes every file,
    symlink and sub-directory inside it. Deletion is best-effort: an entry
    that cannot be removed is reported on stdout and skipped.
    """
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        return
    for entry in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry)
        try:
            if os.path.isfile(entry_path) or os.path.islink(entry_path):
                os.unlink(entry_path)
            elif os.path.isdir(entry_path):
                shutil.rmtree(entry_path)
        except OSError as e:
            print('Failed to delete %s. Reason: %s' % (entry_path, e))


def get_bm25_scores(question, data_dir='./data', folder_path='retrieval', top_n=5):
    """Copy the documents whose file names best match ``question`` into a folder.

    Every regular file in ``data_dir`` is read as a document. Its title is the
    file name with ``-`` replaced by spaces. BM25 ranking is computed over the
    whitespace-split *titles* (not the document contents) against the
    whitespace-split ``question``. ``folder_path`` is then emptied (or created)
    and the ``top_n`` best-ranked documents are written into it, each under
    its title as the file name, encoded as UTF-8.

    Args:
        question: Query string; tokenized with ``str.split()``.
        data_dir: Directory holding the source documents.
        folder_path: Output directory that receives the selected documents.
        top_n: Maximum number of documents to write.

    Returns:
        None. The result is the content of ``folder_path``.
    """
    titles = []
    corpus = []
    # Sorted so that tie-breaking between equally scored titles does not
    # depend on the arbitrary order in which the OS lists the directory.
    for file_name in sorted(os.listdir(data_dir)):
        source_path = os.path.join(data_dir, file_name)
        if not os.path.isfile(source_path):
            # Sub-directories cannot be opened as documents; skip them.
            continue
        with open(source_path, 'r', encoding='cp437') as f:
            lines = f.readlines()
        titles.append(" ".join(file_name.split("-")))
        # Lines keep their trailing newline, so this join puts one extra space
        # in front of every line after the first. Kept as in the original so
        # the written documents stay byte-compatible.
        corpus.append(" ".join(lines))

    top_indices = []
    if titles:
        # BM25Okapi cannot be built from an empty corpus (its average document
        # length would divide by zero), hence the guard.
        bm25 = BM25Okapi([title.split() for title in titles])
        top_indices = bm25.get_top_n(
            question.split(), list(range(len(titles))), n=top_n
        )

    # Ranking happens before the folder is touched, so a failure above leaves
    # the previous retrieval results in place.
    _reset_folder(folder_path)

    for index in top_indices:
        target_path = os.path.join(folder_path, titles[index])
        with open(target_path, 'w', encoding='utf-8') as file:
            file.write(corpus[index])