import gradio as gr
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
from gradio_client import Client
from functools import lru_cache

# Cache the model and tokenizer using lru_cache
@lru_cache(maxsize=1)
def load_model_and_tokenizer():
    model_name = "./all-MiniLM-L6-v2"  # Replace with your Space and model path
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model

# Load the model and tokenizer
tokenizer, model = load_model_and_tokenizer()

# Candidate context labels (their embeddings are precomputed below)
labels = [
    "aerospace", "anatomy", "anthropology", "art", 
    "automotive", "blockchain", "biology", "chemistry", 
    "cryptocurrency", "data science", "design", "e-commerce",
    "education", "engineering", "entertainment", "environment",
    "fashion", "finance", "food commerce", "gaming",
    "healthcare", "history", "information technology", 
    "legal", "machine learning", "marketing", "medicine",
    "music", "philosophy", "physics", "politics", "real estate", "retail", 
    "robotics", "social media", "sports", "technical",
    "tourism", "travel"
]
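
# Note: the tones, styles, and gender_number lists below are currently unused
# in this file; they appear to be placeholders for future style controls.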

tones = [
    "formal", "positive", "negative", "poetic", "polite", "subtle", "casual", "neutral",
    "informal", "pompous", "sustained", "rude",
    "sophisticated", "playful", "serious", "friendly"
]

styles = [
    "poetry", "novel", "theater", "slang", "speech", "keywords", "html", "programming"
]

gender_number = [
    "masculine singular", "masculine plural", "feminine singular", "feminine plural"
]

@lru_cache(maxsize=1)
def precompute_label_embeddings():
    inputs = tokenizer(labels, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Mask-aware mean pooling: average only over real tokens, not padding,
    # so shorter labels are not skewed by pad-token embeddings
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / counts).numpy()

label_embeddings = precompute_label_embeddings()

# Softmax function to convert scores to probabilities
def softmax(x):
    exp_x = np.exp(x - np.max(x))  # Subtract max for numerical stability
    return exp_x / exp_x.sum()
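
# Example: softmax(np.array([0.2, 0.1])) ≈ array([0.525, 0.475])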

# Function to detect context
def detect_context(input_text, threshold=0.03):
    # Encode the input text (a single sequence, so no padding tokens are added
    # and a plain mean over tokens matches the masked pooling used for labels)
    inputs = tokenizer([input_text], padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    input_embedding = outputs.last_hidden_state.mean(dim=1).numpy()

    # Compute cosine similarities
    similarities = cosine_similarity(input_embedding, label_embeddings)[0]

    # Apply softmax to convert similarities to probabilities
    probabilities = softmax(similarities)

    # Pair each label with its probability
    label_probabilities = list(zip(labels, probabilities))

    # Filter contexts with confidence >= threshold
    high_confidence_contexts = [(label, score) for label, score in label_probabilities if score >= threshold]

    # If no contexts meet the threshold, default to "general"
    if not high_confidence_contexts:
        high_confidence_contexts = [("general", 1.0)]  # Assign a default score of 1.0 for "general"

    return high_confidence_contexts
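
# Illustrative usage (scores are hypothetical and depend on the model):
#   detect_context("How do I deploy a smart contract?")
#   -> [("blockchain", 0.034), ("cryptocurrency", 0.031)]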

# Mock translation clients for different contexts
@lru_cache(maxsize=None)
def get_translation_client(context):
    """
    Returns the appropriate Hugging Face Space client for the given context.
    For now, all contexts use the same mock Space; caching the client avoids
    opening a new connection on every request.
    """
    return Client("Frenchizer/space_7")  # Replace with actual Space paths for each context

def translate_text(input_text, context):
    """
    Translates the input text using the appropriate model for the given context.
    """
    client = get_translation_client(context)
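    # Depending on the gradio_client version, an explicit endpoint may be
    # required here, e.g. client.predict(input_text, api_name="/predict")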
    return client.predict(input_text)

def process_request(input_text):
    # Step 1: Detect context
    context_results = detect_context(input_text)

    # Step 2: Translate the text for each context
    translations = {}
    for context, score in context_results:
        translations[context] = translate_text(input_text, context)

    # Step 3: Print the list of high-confidence contexts and translations
    print("High-confidence contexts (score >= 0.022):", context_results)
    print("Translations:", translations)

    # Return the translations and contexts
    return translations, context_results
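
# Illustrative return value (contents hypothetical):
#   ({"finance": "Traduction..."}, [("finance", 0.05)])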

# Gradio interface
def gradio_interface(input_text):
    translations, contexts = process_request(input_text)
    # Format the output: one line per detected context
    lines = [f"[{context}] {text}" for context, text in translations.items()]
    return "\n".join(lines)

# Create the Gradio interface
interface = gr.Interface(
    fn=gradio_interface,
    inputs="text",
    outputs="text",
    title="Frenchizer",
    description="Translate text from English to French with context detection and a confidence threshold."
)

interface.launch()