import gradio as gr
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
from gradio_client import Client
from functools import lru_cache
# Cache the model and tokenizer using lru_cache
@lru_cache(maxsize=1)
def load_model_and_tokenizer():
    model_name = "./all-MiniLM-L6-v2"  # Replace with your Space and model path
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model
# Load the model and tokenizer
tokenizer, model = load_model_and_tokenizer()
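# The relative path above assumes a local copy of the model checked into the
# Space; the same checkpoint can also be pulled straight from the Hub, e.g.
# AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2").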
# Precompute label embeddings
labels = [
    "aerospace", "anatomy", "anthropology", "art",
    "automotive", "blockchain", "biology", "chemistry",
    "cryptocurrency", "data science", "design", "e-commerce",
    "education", "engineering", "entertainment", "environment",
    "fashion", "finance", "food commerce", "gaming",
    "healthcare", "history", "information technology",
    "legal", "machine learning", "marketing", "medicine",
    "music", "philosophy", "physics", "politics", "real estate", "retail",
    "robotics", "social media", "sports", "technical",
    "tourism", "travel"
]
tones = [
    "formal", "positive", "negative", "poetic", "polite", "subtle", "casual", "neutral",
    "informal", "pompous", "sustained", "rude",
    "sophisticated", "playful", "serious", "friendly"
]
styles = [
    "poetry", "novel", "theater", "slang", "speech", "keywords", "html", "programming"
]
gender_number = [
    "masculine singular", "masculine plural", "feminine singular", "feminine plural"
]
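# Note: tones, styles and gender_number are defined but not referenced below;
# they appear to be reserved for later use.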
@lru_cache(maxsize=1)
def precompute_label_embeddings():
    inputs = tokenizer(labels, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()  # Mean pooling for embeddings
label_embeddings = precompute_label_embeddings()
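# Note: the plain mean over last_hidden_state above also averages padding
# tokens. A minimal sketch of attention-mask-aware mean pooling, the scheme
# usually recommended for sentence-transformers models such as
# all-MiniLM-L6-v2 (this helper is illustrative and not called below):
def masked_mean_pooling(outputs, attention_mask):
    # Zero out hidden states at padded positions, then average over real tokens
    mask = attention_mask.unsqueeze(-1).expand(outputs.last_hidden_state.size()).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / counts).numpy()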
# Softmax function to convert scores to probabilities
def softmax(x):
    exp_x = np.exp(x - np.max(x))  # Subtract max for numerical stability
    return exp_x / exp_x.sum()
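# Worked example with made-up scores: softmax(np.array([0.2, 0.5, 0.1]))
# returns roughly array([0.31, 0.41, 0.28]); the values now sum to 1.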
# Function to detect context
def detect_context(input_text, threshold=0.03):
    # Encode the input text
    inputs = tokenizer([input_text], padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    input_embedding = outputs.last_hidden_state.mean(dim=1).numpy()  # Mean pooling for embedding
    # Compute cosine similarities between the input and every label embedding
    similarities = cosine_similarity(input_embedding, label_embeddings)[0]
    # Apply softmax to convert similarities to probabilities
    probabilities = softmax(similarities)
    # Pair each label with its probability
    label_probabilities = list(zip(labels, probabilities))
    # Keep only contexts with confidence >= threshold
    high_confidence_contexts = [(label, score) for label, score in label_probabilities if score >= threshold]
    # If no contexts meet the threshold, default to "general"
    if not high_confidence_contexts:
        high_confidence_contexts = [("general", 1.0)]  # Assign a default score of 1.0 for "general"
    return high_confidence_contexts
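# Illustrative call (hypothetical scores; real output depends on the model):
# detect_context("The court dismissed the appeal")
# -> [("legal", 0.052), ("politics", 0.031)]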
# Mock translation clients for different contexts
def get_translation_client(context):
    """
    Returns the appropriate Hugging Face Space client for the given context.
    For now, all contexts use the same mock Space.
    """
    return Client("Frenchizer/space_7")  # Replace with actual Space paths for each context
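# Note: a new Client is created on every call. If connection setup becomes a
# bottleneck, this function could be wrapped in lru_cache too (assuming the
# gradio_client Client is safe to reuse across requests).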
def translate_text(input_text, context):
    """
    Translates the input text using the appropriate model for the given context.
    """
    client = get_translation_client(context)
    return client.predict(input_text)
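# If the target Space exposes more than one endpoint, the call may need an
# explicit endpoint name, e.g. client.predict(input_text, api_name="/predict");
# here the default endpoint is assumed.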
def process_request(input_text):
    # Step 1: Detect context
    context_results = detect_context(input_text)
    # Step 2: Translate the text for each high-confidence context
    translations = {}
    for context, score in context_results:
        translations[context] = translate_text(input_text, context)
    # Step 3: Log the high-confidence contexts and translations
    print("High-confidence contexts (score >= threshold):", context_results)
    print("Translations:", translations)
    # Return the translations and contexts
    return translations, context_results
# Gradio interface
def gradio_interface(input_text):
    translations, contexts = process_request(input_text)
    # Format the translations dict as the text output
    output = f"{translations}\n"
    return output.strip()
# Create the Gradio interface
interface = gr.Interface(
    fn=gradio_interface,
    inputs="text",
    outputs="text",
    title="Frenchizer",
    description="Translate text from English to French with context detection and threshold."
)

interface.launch()