Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -51,6 +51,7 @@ def arabic_word_tokenize(text):
|
|
51 |
text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
|
52 |
# Extract only Arabic words (length ≥ 2)
|
53 |
tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
|
|
|
54 |
return [t for t in tokens if t not in ARABIC_STOPWORDS]
|
55 |
|
56 |
def prepare_bm25_corpus(questions):
|
@@ -71,12 +72,16 @@ bm25_model1 = BM25Okapi(bm25_corpus1)
|
|
71 |
bm25_model2 = BM25Okapi(bm25_corpus2)
|
72 |
bm25_model3 = BM25Okapi(bm25_corpus3)
|
73 |
print("BM25 models initialized!")
|
|
|
|
|
|
|
|
|
74 |
|
75 |
-
def compute_bm25_scores(query, bm25_model):
|
76 |
"""Compute BM25 scores for a query"""
|
77 |
query_tokens = arabic_word_tokenize(query)
|
78 |
if not query_tokens:
|
79 |
-
return np.zeros(
|
80 |
|
81 |
scores = bm25_model.get_scores(query_tokens)
|
82 |
return scores
|
@@ -133,9 +138,9 @@ def predict(text):
|
|
133 |
util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
|
134 |
|
135 |
# BM25 scores
|
136 |
-
bm25_scores1 = compute_bm25_scores(text, bm25_model1)
|
137 |
-
bm25_scores2 = compute_bm25_scores(text, bm25_model2)
|
138 |
-
bm25_scores3 = compute_bm25_scores(text, bm25_model3)
|
139 |
|
140 |
# Word overlap scores
|
141 |
word_overlap1 = compute_word_overlap(text, df_questions)
|
|
|
51 |
text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
|
52 |
# Extract only Arabic words (length ≥ 2)
|
53 |
tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
|
54 |
+
|
55 |
return [t for t in tokens if t not in ARABIC_STOPWORDS]
|
56 |
|
57 |
def prepare_bm25_corpus(questions):
|
|
|
72 |
bm25_model2 = BM25Okapi(bm25_corpus2)
|
73 |
bm25_model3 = BM25Okapi(bm25_corpus3)
|
74 |
print("BM25 models initialized!")
|
75 |
+
corpus_length1 = len(df_questions)
|
76 |
+
corpus_length2 = len(df2_questions)
|
77 |
+
corpus_length3 = len(df3_questions)
|
78 |
+
|
79 |
|
80 |
+
def compute_bm25_scores(query, bm25_model, corpus_length=None):
    """Compute BM25 scores for a query.

    Args:
        query: Raw query text; it is tokenized with ``arabic_word_tokenize``.
        bm25_model: BM25 model object providing ``get_scores(tokens)``.
        corpus_length: Length of the zero vector returned when the query
            produces no tokens. When omitted, ``bm25_model.corpus_size`` is
            used instead (assumes a rank_bm25-style model that exposes this
            attribute).

    Returns:
        The result of ``bm25_model.get_scores`` for the query tokens, or a
        NumPy array of ``corpus_length`` zeros when tokenization yields
        nothing.
    """
    query_tokens = arabic_word_tokenize(query)
    if not query_tokens:
        # No usable tokens: skip the model call and return an all-zero vector.
        if corpus_length is None:
            # Derive the size from the model so callers need not track it.
            corpus_length = bm25_model.corpus_size
        return np.zeros(corpus_length)

    return bm25_model.get_scores(query_tokens)
|
|
|
138 |
util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
|
139 |
|
140 |
# BM25 scores
|
141 |
+
bm25_scores1 = compute_bm25_scores(text, bm25_model1,corpus_length1)
|
142 |
+
bm25_scores2 = compute_bm25_scores(text, bm25_model2,corpus_length2)
|
143 |
+
bm25_scores3 = compute_bm25_scores(text, bm25_model3,corpus_length3)
|
144 |
|
145 |
# Word overlap scores
|
146 |
word_overlap1 = compute_word_overlap(text, df_questions)
|