mohbay committed on
Commit
3ad2b97
·
verified ·
1 Parent(s): ed06f94

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -5
app.py CHANGED
@@ -51,6 +51,7 @@ def arabic_word_tokenize(text):
51
  text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
52
  # Extract only Arabic words (length ≥ 2)
53
  tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
 
54
  return [t for t in tokens if t not in ARABIC_STOPWORDS]
55
 
56
  def prepare_bm25_corpus(questions):
@@ -71,12 +72,16 @@ bm25_model1 = BM25Okapi(bm25_corpus1)
71
  bm25_model2 = BM25Okapi(bm25_corpus2)
72
  bm25_model3 = BM25Okapi(bm25_corpus3)
73
  print("BM25 models initialized!")
 
 
 
 
74
 
75
- def compute_bm25_scores(query, bm25_model):
76
  """Compute BM25 scores for a query"""
77
  query_tokens = arabic_word_tokenize(query)
78
  if not query_tokens:
79
- return np.zeros(len(bm25_model.corpus))
80
 
81
  scores = bm25_model.get_scores(query_tokens)
82
  return scores
@@ -133,9 +138,9 @@ def predict(text):
133
  util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
134
 
135
  # BM25 scores
136
- bm25_scores1 = compute_bm25_scores(text, bm25_model1)
137
- bm25_scores2 = compute_bm25_scores(text, bm25_model2)
138
- bm25_scores3 = compute_bm25_scores(text, bm25_model3)
139
 
140
  # Word overlap scores
141
  word_overlap1 = compute_word_overlap(text, df_questions)
 
51
  text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
52
  # Extract only Arabic words (length ≥ 2)
53
  tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
54
+
55
  return [t for t in tokens if t not in ARABIC_STOPWORDS]
56
 
57
  def prepare_bm25_corpus(questions):
 
72
  bm25_model2 = BM25Okapi(bm25_corpus2)
73
  bm25_model3 = BM25Okapi(bm25_corpus3)
74
  print("BM25 models initialized!")
75
+ corpus_length1 = len(df_questions)
76
+ corpus_length2 = len(df2_questions)
77
+ corpus_length3 = len(df3_questions)
78
+
79
 
80
+ def compute_bm25_scores(query, bm25_model,corpus_length):
81
  """Compute BM25 scores for a query"""
82
  query_tokens = arabic_word_tokenize(query)
83
  if not query_tokens:
84
+ return np.zeros(corpus_length)
85
 
86
  scores = bm25_model.get_scores(query_tokens)
87
  return scores
 
138
  util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
139
 
140
  # BM25 scores
141
+ bm25_scores1 = compute_bm25_scores(text, bm25_model1,corpus_length1)
142
+ bm25_scores2 = compute_bm25_scores(text, bm25_model2,corpus_length2)
143
+ bm25_scores3 = compute_bm25_scores(text, bm25_model3,corpus_length3)
144
 
145
  # Word overlap scores
146
  word_overlap1 = compute_word_overlap(text, df_questions)