"""
In this example we train a semantic search model to search through Wikipedia
articles about programming articles & technologies.
We use the text paragraphs from the following Wikipedia articles:
Assembly language, C , C Sharp , C++, Go , Java , JavaScript, Keras, Laravel, MATLAB, Matplotlib, MongoDB, MySQL, Natural Language Toolkit, NumPy, pandas (software), Perl, PHP, PostgreSQL, Python , PyTorch, R , React, Rust , Scala , scikit-learn, SciPy, Swift , TensorFlow, Vue.js
In:
1_programming_query_generation.py - We generate queries for all paragraphs from these articles
2_programming_train_bi-encoder.py - We train a SentenceTransformer bi-encoder with these generated queries. This results in a model we can then use for sematic search (for the given Wikipedia articles).
3_programming_semantic_search.py - Shows how the trained model can be used for semantic search
"""
import json
import gzip
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import tqdm
import os
from sentence_transformers import util
paragraphs = set()
# We use Wikipedia articles about programming languages & technologies as corpus
corpus_filepath = 'wiki-programmming-20210101.jsonl.gz'
if not os.path.exists(corpus_filepath):
    util.http_get('https://sbert.net/datasets/wiki-programmming-20210101.jsonl.gz', corpus_filepath)
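# Each line of the corpus file is one JSON object per article; this script only
# relies on its 'paragraphs' field, a list of paragraph strings.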
with gzip.open(corpus_filepath, 'rt') as fIn:
    for line in fIn:
        data = json.loads(line.strip())
        for p in data['paragraphs']:
            if len(p) > 100:  # Only keep paragraphs with more than 100 characters
                paragraphs.add(p)  # The set removes duplicate paragraphs

paragraphs = list(paragraphs)
print("Paragraphs:", len(paragraphs))
# Now we load the model that generates queries for a given paragraph.
# It was trained on the MS MARCO dataset, a collection of ~500k real
# Bing queries, each paired with a relevant passage.
tokenizer = T5Tokenizer.from_pretrained('BeIR/query-gen-msmarco-t5-large-v1')
model = T5ForConditionalGeneration.from_pretrained('BeIR/query-gen-msmarco-t5-large-v1')
model.eval()
#Select the device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
# Parameters for generation
batch_size = 8              # Paragraphs per generation batch
num_queries = 5             # Number of queries to generate for every paragraph
max_length_paragraph = 300  # Max number of input tokens per paragraph
max_length_query = 64       # Max number of tokens for a generated query
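# Note: generate() below returns num_queries sequences per input paragraph,
# so generated_queries.tsv will contain num_queries * len(paragraphs) lines in total.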
# Now for every paragraph in our corpus, we generate the queries
with open('generated_queries.tsv', 'w') as fOut:
    for start_idx in tqdm.trange(0, len(paragraphs), batch_size):
        sub_paragraphs = paragraphs[start_idx:start_idx + batch_size]
        # Tokenize the batch directly (prepare_seq2seq_batch is deprecated in
        # recent transformers versions); padding is needed to batch paragraphs
        # of different lengths
        inputs = tokenizer(sub_paragraphs, max_length=max_length_paragraph, padding=True, truncation=True, return_tensors='pt').to(device)
        outputs = model.generate(
            **inputs,
            max_length=max_length_query,
            do_sample=True,  # Sample instead of greedy decoding to get diverse queries
            top_p=0.95,      # Nucleus sampling
            num_return_sequences=num_queries)
        # The num_queries sequences for each paragraph are returned consecutively,
        # so output idx belongs to input paragraph idx // num_queries
        for idx, out in enumerate(outputs):
            query = tokenizer.decode(out, skip_special_tokens=True)
            para = sub_paragraphs[idx // num_queries]
            fOut.write("{}\t{}\n".format(query.replace("\t", " ").strip(), para.replace("\t", " ").strip()))
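# A minimal sketch (not part of this script) of how 2_programming_train_bi-encoder.py
# can consume these pairs; the base model name and the loss are assumptions based on
# the sentence-transformers documentation, not taken from that file:
#
#   from torch.utils.data import DataLoader
#   from sentence_transformers import SentenceTransformer, InputExample, losses
#
#   train_examples = []
#   with open('generated_queries.tsv') as fIn:
#       for line in fIn:
#           query, paragraph = line.strip().split('\t', maxsplit=1)
#           train_examples.append(InputExample(texts=[query, paragraph]))
#
#   bi_encoder = SentenceTransformer('distilbert-base-uncased')  # assumed base model
#   train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=64)
#   train_loss = losses.MultipleNegativesRankingLoss(bi_encoder)
#   bi_encoder.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)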