endre sukosd committed
Commit 1565c8a · 1 Parent(s): 3992084

Add precalculated embeddings data files tracked with git-lfs

Files changed (4)
  1. .gitattributes +0 -2
  2. .gitignore +0 -1
  3. README.md +2 -0
  4. src/app.py +6 -4
.gitattributes CHANGED
@@ -25,5 +25,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-data/processed/shortened_abstracts_hu_2021_09_01.txt filter=lfs diff=lfs merge=lfs -text
-data/processed/shortened_abstracts_hu_2021_09_01_embedded.pt filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,6 +1,5 @@
 # Custom
 hf_venv/
-data/
 *.DS_Store
 
 # Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,python,jupyternotebooks,venv
README.md CHANGED
@@ -61,6 +61,8 @@ Model facts:
 
 To reproduce the precalculated embeddings, use the notebook in `notebooks/QA_retrieval_precalculate_embeddings.ipynb`, with a GPU in Google Colab.
 
+Known bug: the precalculated embeddings contain an extra random tensor at the beginning, bringing the total size to 466529 (one more than the number of raw sentences). This is corrected by subtracting 1 from the index of the most similar embedding to find the corresponding raw sentence.
+
 ## Search top-k matches
 
 Finally, having all the precalculated embeddings, we can implement semantic search (dense retrieval): we encode the search query into vector space and retrieve the document embeddings that are closest in vector space (using cosine similarity). By default, the top 5 most similar Wikipedia abstracts are returned. See the main script `src/main_qa.py`.
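The index shift from the known-bug note, combined with the top-k retrieval described in the README, can be sketched as follows. This is a minimal illustration assuming torch tensors for the query and corpus embeddings; `top_k_matches` is a hypothetical helper name, not the repository's API.

```python
import torch
import torch.nn.functional as F

def top_k_matches(query_embedding: torch.Tensor,
                  corpus_embeddings: torch.Tensor,
                  sentences: list[str],
                  k: int = 5) -> list[dict]:
    # Cosine similarity between the query (1, dim) and every
    # precalculated abstract embedding (N, dim) -> (N,) scores.
    scores = F.cosine_similarity(query_embedding, corpus_embeddings)
    top = torch.topk(scores, k)
    results = []
    for score, idx in zip(top.values.tolist(), top.indices.tolist()):
        # Known-bug workaround from the README: the embedding matrix has
        # one extra leading row, so shift the index back by one to reach
        # the corresponding raw sentence.
        sentence_idx = idx - 1
        if 0 <= sentence_idx < len(sentences):
            results.append({'score': score, 'text': sentences[sentence_idx]})
    return results
```

Note that `src/app.py` (below) takes a simpler route in this commit: it guards the index range instead of shifting it.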
src/app.py CHANGED
@@ -26,7 +26,8 @@ def findTopKMostSimilar(query_embedding, embeddings, all_sentences, k):
     cosine_scores_list = cosine_scores.squeeze().tolist()
     pairs = []
     for idx,score in enumerate(cosine_scores_list):
-        pairs.append({'index': idx, 'score': score, 'text': all_sentences[idx]})
+        if idx < len(all_sentences):
+            pairs.append({'score': score, 'text': all_sentences[idx]})
     pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)
     return pairs[0:k]
 
@@ -49,10 +50,11 @@ embeddings_file = 'data/processed/shortened_abstracts_hu_2021_09_01_embedded.pt'
 all_embeddings = load_embeddings(embeddings_file)
 
 
-st.text('Search Wikipedia abstracts in Hungarian - Input some search term and see the top-5 most similar wikipedia abstracts')
-st.text('Wikipedia absztrakt kereső - adjon meg egy tetszőleges kifejezést és a rendszer visszaadja az 5 hozzá legjobban hasonlító Wikipedia absztraktot')
+st.header('Wikipedia absztrakt kereső')
+st.subheader('Search Wikipedia abstracts in Hungarian')
+st.caption('Input some search term and see the top-5 most similar wikipedia abstracts')
 
-input_query = st.text_area("Hol élnek a bengali tigrisek?")
+input_query = st.text_area("Adjon meg egy tetszőleges kifejezést és a rendszer visszaadja az 5 hozzá legjobban hasonlító Wikipedia absztraktot", value='Hol élnek a bengali tigrisek?')
 
 if input_query:
     query_embedding = calculateEmbeddings([input_query],tokenizer,model)
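Taken together, the two hunks give roughly the following picture of the updated script. This is a reconstruction for readability, assuming a torch-based cosine-score line for the part not shown in the hunk; model loading and the query handling below the hunk are omitted, and it is not the verbatim file.

```python
import torch
import streamlit as st

def findTopKMostSimilar(query_embedding, embeddings, all_sentences, k):
    # Assumed implementation of the score line that sits above the hunk.
    cosine_scores = torch.nn.functional.cosine_similarity(query_embedding, embeddings)
    cosine_scores_list = cosine_scores.squeeze().tolist()
    pairs = []
    for idx, score in enumerate(cosine_scores_list):
        # New guard: the embedding tensor has one surplus row, so skip
        # any index that has no matching raw sentence.
        if idx < len(all_sentences):
            pairs.append({'score': score, 'text': all_sentences[idx]})
    pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)
    return pairs[0:k]

# The two st.text() lines are replaced by a header/subheader/caption trio.
st.header('Wikipedia absztrakt kereső')  # "Wikipedia abstract search"
st.subheader('Search Wikipedia abstracts in Hungarian')
st.caption('Input some search term and see the top-5 most similar wikipedia abstracts')

# The Hungarian label reads "enter any phrase and the system returns the
# 5 most similar Wikipedia abstracts"; the old example question
# "Hol élnek a bengali tigrisek?" ("Where do Bengal tigers live?") moves
# from being the label to being the default value.
input_query = st.text_area(
    'Adjon meg egy tetszőleges kifejezést és a rendszer visszaadja az 5 '
    'hozzá legjobban hasonlító Wikipedia absztraktot',
    value='Hol élnek a bengali tigrisek?')
```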