mrmuminov committed
Commit 6078730 · 1 Parent(s): 8ac9b4d
Files changed (3):
  1. README.md +1 -1
  2. app.py +99 -0
  3. requirements.txt +3 -0
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🌖
 colorFrom: blue
 colorTo: pink
 sdk: gradio
-sdk_version: 5.12.0
+sdk_version: 4.44.1
 app_file: app.py
 pinned: false
 short_description: Tahrirchi BERT Base - Embedding
app.py ADDED
@@ -0,0 +1,99 @@
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+import torch
+import gradio as gr
+import numpy as np
+import json
+
+class BertEmbeddingsGenerator:
+    def __init__(self, model_name="tahrirchi/tahrirchi-bert-base"):
+        """Initialize the BERT model and tokenizer."""
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
+        self.model.eval()  # Set to evaluation mode
+
+    def get_embeddings(self, text):
+        """
+        Generate embeddings for the input text.
+
+        Args:
+            text (str): Input text to embed
+
+        Returns:
+            np.ndarray: Text embeddings
+        """
+        # Tokenize input text
+        inputs = self.tokenizer(
+            text,
+            return_tensors="pt",
+            truncation=True,
+            padding=True,
+            max_length=512
+        )
+
+        # Generate embeddings
+        with torch.no_grad():
+            outputs = self.model(**inputs, output_hidden_states=True)
+
+        # Get the hidden states from the last layer
+        # The hidden_states tuple contains embeddings from all layers; -1 gets the last layer
+        last_hidden_state = outputs.hidden_states[-1]
+
+        # Average token embeddings to get a sentence embedding
+        embeddings = last_hidden_state.mean(dim=1)
+
+        # Convert to a NumPy array
+        return embeddings.squeeze().cpu().numpy()
+
+def create_gradio_interface():
+    """Create and configure the Gradio interface."""
+    # Initialize the embeddings generator
+    generator = BertEmbeddingsGenerator()
+
+    def embed_text(input_text):
+        """Gradio interface function."""
+        try:
+            if not input_text or not input_text.strip():
+                return json.dumps({"error": "Please enter some text"})
+
+            embeddings = generator.get_embeddings(input_text)
+
+            # Convert the NumPy array to a list, replacing NaN/Infinity values with None
+            embeddings_list = np.where(np.isfinite(embeddings), embeddings, None).tolist()
+
+            # Create a structured output
+            output = {
+                "embeddings": embeddings_list,
+                "dimensions": len(embeddings_list),
+                "status": "success"
+            }
+
+            return json.dumps(output, ensure_ascii=False)
+
+        except Exception as e:
+            return json.dumps({
+                "error": str(e),
+                "status": "error"
+            })
+
+    # Create the Gradio interface
+    iface = gr.Interface(
+        fn=embed_text,
+        inputs=gr.Textbox(
+            lines=2,
+            placeholder="Enter text here...",
+            label="Input Text"
+        ),
+        outputs=gr.JSON(label="Embeddings"),
+        title="BERT Text Embeddings Generator",
+        description="Generate embeddings from text using the tahrirchi-bert-base model",
+        examples=[
+            ["This is a sample text to generate embeddings."],
+            ["Another example text to showcase the embedding generation."]
+        ]
+    )
+    return iface
+
+if __name__ == "__main__":
+    # Create and launch the interface
+    iface = create_gradio_interface()
+    iface.launch()
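
As a sanity check, the class above can be exercised directly without launching the Gradio app. This is a minimal sketch, assuming app.py is importable from the working directory and that the model uses a BERT-base hidden size of 768:

```python
# Minimal local smoke test for BertEmbeddingsGenerator (no Gradio UI).
from app import BertEmbeddingsGenerator  # assumes app.py is on the import path

generator = BertEmbeddingsGenerator()  # downloads tahrirchi/tahrirchi-bert-base on first run
vector = generator.get_embeddings("Salom, dunyo!")

print(vector.shape)  # expected (768,) if the model has a BERT-base hidden size
print(vector[:5])    # first few components of the mean-pooled sentence embedding
```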
requirements.txt ADDED
@@ -0,0 +1,3 @@
+transformers==4.38
+torch==2.3.0
+gradio==4.44.1
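
Once the Space is running with these pinned dependencies, the endpoint can also be called programmatically. Below is a sketch using gradio_client, where the Space id is a placeholder and /predict is assumed to be the default endpoint name generated for a single gr.Interface:

```python
# Query the deployed Space through its Gradio API (sketch; Space id is a placeholder).
import json
from gradio_client import Client

client = Client("<owner>/<space-name>")  # replace with the actual Space id
raw = client.predict(
    "This is a sample text to generate embeddings.",
    api_name="/predict",  # default endpoint name for a single gr.Interface
)

# The JSON output component may come back already parsed or as a string.
result = raw if isinstance(raw, dict) else json.loads(raw)
print(result["status"], result["dimensions"])
```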