"""
This script contains an example how to perform semantic search with ElasticSearch.

As dataset, we use the Quora Duplicate Questions dataset, which contains about 500k questions:
https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs

Questions are indexed to ElasticSearch together with their respective sentence
embeddings.

The script shows results from BM25 as well as from semantic search with
cosine similarity.

You need ElasticSearch (https://www.elastic.co/de/elasticsearch/) up and running. Further, you need the Python
ElasticSearch Client installed: https://elasticsearch-py.readthedocs.io/en/master/

As embeddings model, we use the SBERT model 'quora-distilbert-multilingual',
that it aligned for 100 languages. I.e., you can type in a question in various languages and it will
return the closest questions in the corpus (questions in the corpus are mainly in English).
"""

import csv
import os
import time

from elasticsearch import Elasticsearch, helpers
from sentence_transformers import SentenceTransformer, util
from tqdm.autonotebook import tqdm



# Connect to the locally running Elasticsearch instance (default: http://localhost:9200)
es = Elasticsearch()

model = SentenceTransformer('quora-distilbert-multilingual')
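# Note (added for clarity; not in the original script): the "dims" value of the dense_vector
# mapping created below must match the embedding size of the model above (768 for this
# DistilBERT-based model). If you swap in a different model, check its dimension first, e.g.:
#   print(model.get_sentence_embedding_dimension())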

url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 100000

#Download dataset if needed
if not os.path.exists(dataset_path):
    print("Download dataset")
    util.http_get(url, dataset_path)

#Get all unique questions from the file (capped at max_corpus_size)
all_questions = {}
with open(dataset_path, encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        all_questions[row['qid1']] = row['question1']
        if len(all_questions) >= max_corpus_size:
            break

        all_questions[row['qid2']] = row['question2']
        if len(all_questions) >= max_corpus_size:
            break

qids = list(all_questions.keys())
questions = [all_questions[qid] for qid in qids]

#Index the data if the index does not exist yet
if not es.indices.exists(index="quora"):
    try:
        es_index = {
            "mappings": {
              "properties": {
                "question": {
                  "type": "text"
                },
                "question_vector": {
                  "type": "dense_vector",
                  "dims": 768
                }
              }
            }
        }

        es.indices.create(index='quora', body=es_index, ignore=[400])
        chunk_size = 500
        print("Index data (you can stop it by pressing Ctrl+C once):")
        with tqdm(total=len(qids)) as pbar:
            for start_idx in range(0, len(qids), chunk_size):
                end_idx = start_idx+chunk_size

                embeddings = model.encode(questions[start_idx:end_idx], show_progress_bar=False)
                bulk_data = []
                for qid, question, embedding in zip(qids[start_idx:end_idx], questions[start_idx:end_idx], embeddings):
                    bulk_data.append({
                            "_index": 'quora',
                            "_id": qid,
                            "_source": {
                                "question": question,
                                "question_vector": embedding
                            }
                        })

                helpers.bulk(es, bulk_data)
                pbar.update(chunk_size)

    except KeyboardInterrupt:
        print("Indexing was interrupted. Continuing with search\n\n")
    except Exception as e:
        print("An exception occurred during indexing: {}. Continuing with search\n\n".format(e))




#Interactive search queries
while True:
    inp_question = input("Please enter a question: ")

    encode_start_time = time.time()
    question_embedding = model.encode(inp_question)
    encode_end_time = time.time()

    #Lexical search with BM25
    bm25 = es.search(index="quora", body={"query": {"match": {"question": inp_question }}})

    #Semantic search: cosineSimilarity returns values in [-1, 1]; adding 1.0 keeps the
    #script score non-negative, as Elasticsearch requires
    sem_search = es.search(index="quora", body={
          "query": {
            "script_score": {
              "query": {
                "match_all": {}
              },
              "script": {
                "source": "cosineSimilarity(params.queryVector, doc['question_vector']) + 1.0",
                "params": {
                  "queryVector": question_embedding
                }
              }
            }
          }
        })
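    # Note (added; not in the original script): both searches above return Elasticsearch's
    # default number of hits (10). To retrieve more candidates, add a "size" parameter to
    # the request body, e.g. es.search(index="quora", body={"size": 20, "query": {...}}).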

    print("Input question:", inp_question)
    print("Computing the embedding took {:.3f} seconds, BM25 search took {:.3f} seconds, semantic search with ES took {:.3f} seconds".format(encode_end_time-encode_start_time, bm25['took']/1000, sem_search['took']/1000))

    print("BM25 results:")
    for hit in bm25['hits']['hits'][0:5]:
        print("\t{}".format(hit['_source']['question']))

    print("\nSemantic Search results:")
    for hit in sem_search['hits']['hits'][0:5]:
        print("\t{}".format(hit['_source']['question']))

    print("\n\n========\n")