Spaces:
Runtime error
Runtime error
# import gradio as gr | |
# gr.load("models/BAAI/bge-m3").launch() | |
import functools
import json
import os

import faiss
import gradio as gr
import numpy as np
import torch
from FlagEmbedding import BGEM3FlagModel
# Define a function to load the ISCO taxonomy | |
def load_isco_taxonomy(file_path: str) -> list:
    """Load the ISCO taxonomy from a JSON Lines file.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON document per line.

    Returns:
        The decoded JSON documents, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        # Blank or whitespace-only lines carry no record and would make
        # json.loads raise, so they are skipped instead of aborting the load.
        return [json.loads(line) for line in file if line.strip()]
# Define a function to create a FAISS index | |
def create_faiss_index(isco_taxonomy, model_name="BAAI/bge-m3"):
    """Embed the taxonomy descriptions and persist a FAISS index plus its id mapping.

    The ``ESCO_DESCRIPTION`` of every entry is encoded with the BGE-M3 model,
    the dense vectors are added to an exact L2 index, and two files are written:
    ``/data/isco_taxonomy.index`` (the FAISS index) and
    ``/data/isco_taxonomy_mapping.json`` (row position -> taxonomy entry).

    Args:
        isco_taxonomy: Sequence of dict-like entries, each with an
            ``ESCO_DESCRIPTION`` key. Entries must be JSON serialisable.
        model_name: Name or path of the BGE-M3 model to load.

    Raises:
        ValueError: If ``isco_taxonomy`` is empty.
        OSError: If the output directory cannot be created or written to.
    """
    if not isco_taxonomy:
        # Without at least one vector the embedding dimension is unknown.
        raise ValueError("isco_taxonomy must contain at least one entry")

    index_path = "/data/isco_taxonomy.index"
    mapping_path = "/data/isco_taxonomy_mapping.json"
    # Fail fast (before the expensive embedding step) if the target directory
    # is missing and cannot be created.
    os.makedirs(os.path.dirname(index_path), exist_ok=True)

    model = BGEM3FlagModel(
        model_name, use_fp16=True, device="cuda" if torch.cuda.is_available() else "cpu"
    )
    texts = [str(entry["ESCO_DESCRIPTION"]) for entry in isco_taxonomy]
    # Only the dense vectors are indexed, so the sparse and ColBERT outputs
    # are not requested.
    embeddings = model.encode(
        texts,
        batch_size=12,
        max_length=128,
        return_dense=True,
        return_sparse=False,
        return_colbert_vecs=False,
    )["dense_vecs"]
    # FAISS expects a contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, index_path)

    with open(mapping_path, "w", encoding="utf-8") as f:
        # JSON object keys are always strings; make that explicit so readers
        # know to look entries up with str(row_index).
        json.dump({str(i): entry for i, entry in enumerate(isco_taxonomy)}, f)
# Define a function to retrieve and rerank using FAISS | |
_INDEX_PATH = "/data/isco_taxonomy.index"
_MAPPING_PATH = "/data/isco_taxonomy_mapping.json"


@functools.lru_cache(maxsize=2)
def _load_embedding_model(model_name):
    """Return the BGE-M3 model for *model_name*, built once and then reused."""
    return BGEM3FlagModel(
        model_name, use_fp16=True, device="cuda" if torch.cuda.is_available() else "cpu"
    )


@functools.lru_cache(maxsize=1)
def _load_index_and_mapping(model_name):
    """Return (FAISS index, id mapping), building both files if either is missing."""
    if not (os.path.exists(_INDEX_PATH) and os.path.exists(_MAPPING_PATH)):
        # Build with the same model that will embed the queries so the
        # vector dimensions agree.
        create_faiss_index(load_isco_taxonomy("isco_taxonomy.jsonl"), model_name)
    index = faiss.read_index(_INDEX_PATH)
    with open(_MAPPING_PATH, "r") as f:
        isco_taxonomy = json.load(f)
    return index, isco_taxonomy


def retrieve_and_rerank_faiss(job, model_name="BAAI/bge-m3", top_k=8):
    """Return the taxonomy entries closest to a job title or description.

    The query is embedded with the BGE-M3 model and searched against the FAISS
    index stored under ``/data``; the index and its mapping are created on
    first use when missing. The model, index and mapping are cached in memory
    so that repeated queries do not reload them.

    Args:
        job: Free-text job title or description.
        model_name: Name or path of the BGE-M3 model used for the query (and
            for building the index if it does not exist yet).
        top_k: Maximum number of neighbours to return.

    Returns:
        A list of ``[distance, ISCO_CODE_4, ISCO_LABEL_4, ESCO_OCCUPATION,
        ESCO_DESCRIPTION]`` rows sorted by ascending distance. The list is
        empty when ``job`` is blank or ``top_k`` is not positive.
    """
    if not job or not job.strip() or top_k < 1:
        return []

    index, isco_taxonomy = _load_index_and_mapping(model_name)
    model = _load_embedding_model(model_name)

    # Only the dense vector is searched, so the other outputs are not requested.
    query_embedding = model.encode(
        [job],
        max_length=128,
        return_dense=True,
        return_sparse=False,
        return_colbert_vecs=False,
    )["dense_vecs"]
    query_embedding = np.ascontiguousarray(query_embedding, dtype="float32")

    distances, indices = index.search(query_embedding, top_k)

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        if idx < 0:
            # FAISS pads with -1 when fewer than top_k vectors are available.
            continue
        entry = isco_taxonomy[str(idx)]  # JSON mapping keys are strings
        results.append(
            [
                float(distance),
                entry["ISCO_CODE_4"],
                entry["ISCO_LABEL_4"],
                entry["ESCO_OCCUPATION"],
                entry["ESCO_DESCRIPTION"],
            ]
        )
    return sorted(results, key=lambda row: row[0])
# User-facing explanation rendered inside the "Explanation" accordion.
EXPLANATION_MARKDOWN = """
### Overview of the ESCO rank and retrieve application
The ESCO rank and retrieve application developed using Gradio and the BAAI/BGE-m3 model via a FAISS vector database represents a novel approach in the realm of information retrieval, particularly in the context of occupational classifications such as the ISCO-08 standard.
This application leverages machine learning to semantically process and rank occupation-related documents based on their relevance to user-input job descriptions.
### How the Application Works
The application is structured into several key components:
1. **Data preparation:** The ESCO taxonomy data, which includes descriptions of various occupations and corresponding ISCO codes, is initially loaded and processed. This involves reading from a JSON Lines file, ensuring that each entry is correctly formatted and accessible for subsequent operations.
2. **Embedding generation:** Using the BAAI/BGE-m3 model, which is optimized for multilingual information processing and retrieval tasks, embeddings (high-dimensional vector representations) are generated for each occupation description in the ESCO dataset. These embeddings capture the semantic essence of the text, allowing for meaningful comparisons between texts.
3. **Index creation and storage:** The generated embeddings are then stored in a Faiss index. [Faiss](https://faiss.ai/) (Facebook AI Similarity Search) is an efficient library for similarity search and clustering of dense vectors. It facilitates rapid retrieval of items whose embeddings are most similar to that of a query vector (e.g., cosine of the angle or Euclidean distance between two vectors).
4. **Retrieval and Ranking:** When a user submits a job title or description of the job through the Gradio interface, the application:
- Generates an embedding for the input using the same BAAI/BGE-m3 model.
- Queries the pre-computed FAISS index to retrieve the closest occupation descriptions based on the Euclidean (L2) distance between embeddings.
- Ranks these descriptions according to their similarity scores and presents the results to the user.
### Advantages of the rank and retrieve method
#### Enhanced relevance through semantic processing
Unlike traditional keyword-based search methods, the rank and retrieve approach uses pre-trained deep learning models to understand the context and semantics of texts.
This ensures that the results are not just syntactically but also semantically aligned with the user’s query, thereby increasing the relevance and utility of the retrieved documents.
#### Efficiency and scalability
By pre-computing embeddings and storing them in a FAISS index, the application can quickly retrieve and rank documents without the need for on-the-fly computation.
This makes the system highly efficient and scalable, capable of handling large datasets and high query volumes with minimal latency.
#### Avoidance of training on sensitive data
One significant advantage of this approach over traditional text classification models is that it does not require training on sensitive or personally identifiable information (PII).
Since the model operates solely on public domain occupational descriptions from ESCO, there is no need to train a text classification model and hence no risk of exposing personal data.
An important factor given the regulations around data privacy (such as GDPR in Europe) and the ethical considerations of working with PII.
#### Adaptability and Multilingual Capability
The BAAI/BGE-m3 model's multilingual capabilities mean that the application can function effectively across different languages without the need for separate models or extensive retraining.
This adaptability makes it suitable for global deployment, particularly in diverse linguistic and cultural contexts.
### Conclusion
The rank and retrieve application showcases an advanced use of language models in information retrieval, offering a practical, efficient, and privacy-respecting solution for matching job titles (and/or descriptions) with occupational standards like ISCO-08.
"""

# Gradio UI: one text input, a submit button, a results table and a collapsible
# explanation of how the retrieval works.
with gr.Blocks() as demo:
    with gr.Row():
        text1 = gr.Textbox(label="Job")
        # text2 = gr.Textbox(label="Duties")
        # drop1 = gr.Dropdown([4, 6, 8, 10], label="Number of results")
        btn = gr.Button("Submit")
    with gr.Row():
        with gr.Column(scale=1, min_width=600):
            # One row per match, in the column order returned by
            # retrieve_and_rerank_faiss.
            results_table = gr.Dataframe(
                headers=[
                    "Distance",
                    "ISCO code",
                    "ISCO label",
                    "ESCO occupation",
                    "ESCO description",
                ],
                label="Closest ESCO occupations",
                interactive=False,
            )

            def greet(job):
                """Look up the occupations closest to the submitted job text."""
                return retrieve_and_rerank_faiss(job)

            # Without these listeners the button and textbox trigger nothing.
            btn.click(fn=greet, inputs=text1, outputs=results_table)
            text1.submit(fn=greet, inputs=text1, outputs=results_table)
    with gr.Accordion(label="Explanation", open=False):
        gr.Markdown(EXPLANATION_MARKDOWN)
demo.launch()