from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import gradio as gr
import numpy as np
import json


class BertEmbeddingsGenerator:
    """Produce sentence embeddings by mean-pooling a BERT model's last layer."""

    def __init__(self, model_name="tahrirchi/tahrirchi-bert-base"):
        """Load the tokenizer and masked-LM model identified by ``model_name``.

        Args:
            model_name: Identifier passed to both ``from_pretrained`` calls.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
        # Switch the model to evaluation mode for inference.
        self.model.eval()

    @staticmethod
    def _mean_pool(hidden_states, attention_mask=None):
        """Average token vectors over the sequence axis, skipping masked positions.

        Args:
            hidden_states: Tensor whose dim 1 is the token axis (it is reduced
                with ``dim=1``).
            attention_mask: Optional tensor with one entry per token position;
                zero entries are excluded from the average. When ``None``, a
                plain mean over dim 1 is returned.

        Returns:
            Tensor with the token axis reduced away.
        """
        if attention_mask is None:
            return hidden_states.mean(dim=1)

        # Broadcast the mask over the hidden dimension so masked-out positions
        # contribute nothing to the sum.
        weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * weights).sum(dim=1)
        # Guard against division by zero for a row whose mask is all zeros.
        token_counts = weights.sum(dim=1).clamp(min=1.0)
        return summed / token_counts

    def get_embeddings(self, text):
        """Generate an embedding for the input text.

        The text is tokenized (truncated to 512 tokens), run through the model
        without gradient tracking, and the last hidden layer is mean-pooled
        over the tokens selected by the attention mask.

        Args:
            text (str): Input text to embed.

        Returns:
            np.ndarray: The pooled embedding with size-1 dimensions squeezed
            out.
        """
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )

        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)

        # hidden_states holds one tensor per layer; -1 selects the last layer.
        last_hidden_state = outputs.hidden_states[-1]

        # Padding is enabled above, so weight the average by the attention
        # mask instead of averaging over every position.
        pooled = self._mean_pool(last_hidden_state, inputs.get("attention_mask"))

        # Return a NumPy array; conversion to a list is left to the caller.
        return pooled.squeeze().cpu().numpy()


def create_gradio_interface():
    """Create and configure the Gradio interface.

    Returns:
        The ``gr.Interface`` wrapping the embedding function; the caller is
        responsible for launching it.
    """
    # Build the generator once so every request reuses the same model.
    generator = BertEmbeddingsGenerator()

    def error_response(message):
        """Serialize an error payload with the same shape on every error path."""
        return json.dumps(
            {"error": message, "status": "error"},
            ensure_ascii=False,
        )

    def embed_text(input_text):
        """Return a JSON string with the embedding of ``input_text`` or an error."""
        # Reject missing or whitespace-only input before touching the model.
        if not input_text or not input_text.strip():
            return error_response("Matn kiritilmadi")

        try:
            embeddings = generator.get_embeddings(input_text)

            # NaN/Infinity are not valid JSON, so they are replaced by None
            # (serialized as null) before converting to a plain list.
            embeddings_list = np.where(
                np.isfinite(embeddings), embeddings, None
            ).tolist()

            output = {
                "embeddings": embeddings_list,
                "dimensions": len(embeddings_list),
                "status": "success",
            }
            return json.dumps(output, ensure_ascii=False)
        except Exception as e:
            # UI boundary: report the failure to the user instead of crashing.
            return error_response(str(e))

    iface = gr.Interface(
        fn=embed_text,
        inputs=gr.Textbox(
            lines=2,
            placeholder="Matn kiriting...",
            label="Input Text",
        ),
        outputs=gr.JSON(label="Embeddings"),
        title="O'zbek tili uchun embedding",
        description="O'zbek tili uchun Tahrirchi BERT Base modeli orqali embedding generatsiya qilish",
        examples=[
            ["Assalomu alaykum, men o'zbek tili bilan ishlayman"],
            ["O'zbek tili uchun Tahrirchi BERT Base modeli orqali embedding generatsiya qilish uchun namuna matn."],
        ],
    )

    return iface


if __name__ == "__main__":
    # Create and launch the interface
    iface = create_gradio_interface()
    iface.launch()