"""
In this example we train a semantic search model to search through Wikipedia
articles about programming languages & technologies.

We use the text paragraphs from the following Wikipedia articles:
Assembly language, C, C Sharp, C++, Go, Java, JavaScript, Keras, Laravel, MATLAB, Matplotlib, MongoDB, MySQL, Natural Language Toolkit, NumPy, pandas (software), Perl, PHP, PostgreSQL, Python, PyTorch, R, React, Rust, Scala, scikit-learn, SciPy, Swift, TensorFlow, Vue.js

The example consists of three scripts:
1_programming_query_generation.py - We generate queries for all paragraphs from these articles
2_programming_train_bi-encoder.py - We train a SentenceTransformer bi-encoder with these generated queries. This results in a model we can then use for semantic search (for the given Wikipedia articles).
3_programming_semantic_search.py - Shows how the trained model can be used for semantic search
"""
import json
import gzip
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import tqdm
import os
from sentence_transformers import util

paragraphs = set()

# We use the Wikipedia articles of certain programming languages and technologies.
# Note: 'programmming' (three m's) matches the actual file name on the server.
corpus_filepath = 'wiki-programmming-20210101.jsonl.gz'
if not os.path.exists(corpus_filepath):
    util.http_get('https://sbert.net/datasets/wiki-programmming-20210101.jsonl.gz', corpus_filepath)

with gzip.open(corpus_filepath, 'rt') as fIn:
    for line in fIn:
        data = json.loads(line.strip())  # one JSON object per line, with a 'paragraphs' list

        for p in data['paragraphs']:
            if len(p) > 100:    # Only keep paragraphs longer than 100 characters
                paragraphs.add(p)

paragraphs = list(paragraphs)
print("Paragraphs:", len(paragraphs))


# Now we load the model that is able to generate queries given a paragraph.
# This model was trained on the MS MARCO dataset, a dataset with 500k
# queries from Bing and their respective relevant passages.
tokenizer = T5Tokenizer.from_pretrained('BeIR/query-gen-msmarco-t5-large-v1')
model = T5ForConditionalGeneration.from_pretrained('BeIR/query-gen-msmarco-t5-large-v1')
model.eval()

# Select the device: GPU if available, otherwise CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Parameters for generation
batch_size = 8              # Paragraphs per generation batch
num_queries = 5             # Number of queries to generate for every paragraph
max_length_paragraph = 300  # Max input length (in tokens) per paragraph
max_length_query = 64       # Max output length (in tokens) per generated query
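
# Before the full batched loop, a minimal single-input sketch of what the
# generator does. The sample paragraph below is made up (not from the corpus);
# with default greedy decoding the model emits one MS MARCO-style query for it.
example_paragraph = "Python is an interpreted, high-level, general-purpose programming language."
example_ids = tokenizer(example_paragraph, return_tensors='pt').input_ids.to(device)
example_output = model.generate(example_ids, max_length=max_length_query)
print("Example query:", tokenizer.decode(example_output[0], skip_special_tokens=True))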

# Now for every paragraph in our corpus, we generate the queries
with open('generated_queries.tsv', 'w') as fOut:
    for start_idx in tqdm.trange(0, len(paragraphs), batch_size):
        sub_paragraphs = paragraphs[start_idx:start_idx+batch_size]
        inputs = tokenizer(sub_paragraphs, max_length=max_length_paragraph, truncation=True, padding=True, return_tensors='pt').to(device)
        outputs = model.generate(
            **inputs,
            max_length=max_length_query,
            do_sample=True,
            top_p=0.95,
            num_return_sequences=num_queries)

        # generate() returns num_queries sequences per input paragraph, in input order
        for idx, out in enumerate(outputs):
            query = tokenizer.decode(out, skip_special_tokens=True)
            para = sub_paragraphs[idx // num_queries]
            fOut.write("{}\t{}\n".format(query.replace("\t", " ").strip(), para.replace("\t", " ").strip()))
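
# Quick sanity check of the output file: each line is "query<TAB>paragraph".
# Print the first few pairs (3 is an arbitrary choice).
with open('generated_queries.tsv') as fIn:
    for _ in range(3):
        line = fIn.readline()
        if not line:
            break
        query, para = line.rstrip('\n').split('\t', 1)
        print("Query:", query)
        print("Paragraph:", para[:100])
        print()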