Spaces:
Sleeping
Sleeping
import gradio as gr | |
import torch | |
from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline | |
from sentence_transformers import SentenceTransformer, util | |
import openai | |
import random | |
import re | |
import requests | |
import warnings | |
from transformers import logging | |
import os | |
import tensorflow as tf | |
# Groq API key used by segment_into_sentences_groq. It is read from the
# environment so the secret never has to be written into this file; it is
# None when the variable is unset.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Reduce TensorFlow's native log output.
# 0 = all messages, 1 = INFO, 2 = WARNING, 3 = ERROR
# NOTE(review): this assignment runs after `import tensorflow` above; the
# variable is presumably read when TensorFlow's native library loads, so it may
# need to move before that import to take effect -- confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.get_logger().setLevel('ERROR')

# Suppress Python warnings. The final catch-all filter already covers the two
# category-specific ones; they are kept so that removing the catch-all later
# still silences FutureWarning and UserWarning.
warnings.filterwarnings("ignore", category=FutureWarning)  # Suppress FutureWarnings
warnings.filterwarnings("ignore", category=UserWarning)  # Suppress UserWarnings
warnings.filterwarnings("ignore")  # Suppress all warnings (optional)

# Suppress Hugging Face Transformers warnings
# (`logging` here is `transformers.logging`, not the standard-library module).
logging.set_verbosity_error()
def segment_into_sentences_groq(passage, *, timeout=60):
    """Split *passage* into sentences using a Groq-hosted chat model.

    The model is asked to append the marker ``1!2@3#`` after every sentence;
    the reply is then split on that marker.

    Args:
        passage: Text to segment. ``None``, empty or whitespace-only input
            returns ``[]`` without contacting the API.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        A list of non-empty sentences with surrounding whitespace stripped.

    Raises:
        ValueError: If no API key is configured, the API answers with a
            non-200 status, or the response body does not have the expected
            ``choices[0].message.content`` string.
        requests.exceptions.RequestException: On network failures or timeout.
    """
    if not passage or not passage.strip():
        return []

    # Prefer a module-level GROQ_API_KEY when one is defined, otherwise fall
    # back to the environment, so the function works either way.
    api_key = globals().get("GROQ_API_KEY") or os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise ValueError("GROQ_API_KEY is not set; cannot call the Groq API.")

    delimiter = "1!2@3#"
    instruction = (
        f"you are to segment the sentence by adding '{delimiter}' at the end of each sentence. "
        "Return only the segmented sentences only return the modified passage "
        "and nothing else do not add your responses"
    )
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "llama3-8b-8192",
        "messages": [
            {"role": "system", "content": instruction},
            # The instruction is repeated in the user turn, followed by the text.
            {"role": "user", "content": f"{instruction}. here is the passage:{passage}"},
        ],
        "temperature": 1.0,
        "max_tokens": 8192,
    }

    print("response sent")
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        json=payload,
        headers=headers,
        timeout=timeout,
    )
    print("response recieved")

    if response.status_code != 200:
        raise ValueError(f"Groq API error: {response.text}")

    try:
        # Direct indexing (rather than chained .get defaults) so that a missing
        # key or empty list is reported instead of silently yielding no text.
        segmented_text = response.json()["choices"][0]["message"]["content"]
    except (ValueError, LookupError, TypeError) as exc:
        raise ValueError("Unexpected response structure from Groq API.") from exc
    if not isinstance(segmented_text, str):
        raise ValueError("Unexpected response structure from Groq API.")

    print("SOP segmented")
    # Split sentences based on the custom token
    pieces = (piece.strip() for piece in segmented_text.split(delimiter))
    return [piece for piece in pieces if piece]
class TextEnhancer:
    """Paraphrase text sentence by sentence, keeping only close rewrites.

    Three models are loaded on construction:

    * ``prithivida/parrot_paraphraser_on_T5`` generates candidate paraphrases,
    * ``Grammarly/coedit-large`` post-edits the chosen paraphrase,
    * ``paraphrase-MiniLM-L6-v2`` scores candidates against the original.
    """

    def __init__(self):
        """Load all models onto the GPU when CUDA is available, else the CPU."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Initialize paraphrase model
        self.paraphrase_tokenizer = AutoTokenizer.from_pretrained(
            "prithivida/parrot_paraphraser_on_T5"
        )
        self.paraphrase_model = T5ForConditionalGeneration.from_pretrained(
            "prithivida/parrot_paraphraser_on_T5"
        ).to(self.device)
        print("paraphraser loaded")

        # Initialize grammar correction
        self.grammar_pipeline = pipeline(
            "text2text-generation",
            model="Grammarly/coedit-large",
            device=0 if self.device == "cuda" else -1,
        )
        print("grammar check loaded")

        # Initialize semantic similarity model
        self.similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(self.device)
        print("sementics model loaded")

    def enhance_text(self, text, min_similarity=0.8, max_variations=3):
        """Return *text* with each sentence paraphrased where a close rewrite exists.

        Args:
            text: Passage to enhance; it is split into sentences by
                ``segment_into_sentences_groq``.
            min_similarity: Minimum cosine similarity between a paraphrase and
                its source sentence for the paraphrase to be accepted.
            max_variations: Number of beams and of candidate paraphrases
                generated per sentence.

        Returns:
            The sentences joined with single spaces. A sentence with no
            accepted paraphrase is kept as-is. If segmentation yields no
            sentences, *text* is returned unchanged.
        """
        # Use the Groq-backed helper for sentence segmentation
        sentences = segment_into_sentences_groq(text)
        enhanced_sentences = []

        for sentence in sentences:
            if not sentence.strip():
                continue

            inputs = self.paraphrase_tokenizer(
                f"paraphrase: {sentence}",
                return_tensors="pt",
                padding=True,
                max_length=150,
                truncation=True,
            ).to(self.device)

            # The output length budget uses the word count as a rough proxy
            # for the token count, plus some headroom.
            # NOTE(review): do_sample is not set, so `temperature` presumably
            # has no effect on this beam search -- confirm before relying on it.
            outputs = self.paraphrase_model.generate(
                **inputs,
                max_length=len(sentence.split()) + 20,
                num_return_sequences=max_variations,
                num_beams=max_variations,
                temperature=0.7,
            )
            paraphrases = [
                self.paraphrase_tokenizer.decode(output, skip_special_tokens=True)
                for output in outputs
            ]

            # Score every candidate against the original sentence.
            sentence_embedding = self.similarity_model.encode(sentence)
            paraphrase_embeddings = self.similarity_model.encode(paraphrases)
            similarities = util.cos_sim(sentence_embedding, paraphrase_embeddings)
            valid_paraphrases = [
                para
                for para, sim in zip(paraphrases, similarities[0])
                if sim >= min_similarity
            ]

            if valid_paraphrases:
                # Only the first accepted candidate is grammar-corrected and used.
                corrected = self.grammar_pipeline(
                    valid_paraphrases[0],
                    max_length=150,
                    num_return_sequences=1,
                )[0]["generated_text"]
                corrected = self._humanize_text(corrected)
                enhanced_sentences.append(corrected)
            else:
                enhanced_sentences.append(sentence)
            print(sentence)

        if not enhanced_sentences:
            # Nothing was segmented (e.g. empty input): hand the text back
            # rather than returning a lone ".".
            return text
        return self._join_sentences(enhanced_sentences)

    @staticmethod
    def _join_sentences(sentences):
        """Join sentences with spaces, adding "." only where none ends in . ! or ?."""
        finished = []
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            # Look past closing quotes/brackets so 'He said "stop!"' is left alone.
            core = sentence.rstrip("\"')]")
            if not core.endswith((".", "!", "?")):
                sentence += "."
            finished.append(sentence)
        return " ".join(finished)

    def _humanize_text(self, text):
        """
        Introduce small random variations to make text appear more 'human-like'.

        Each variation is applied with a fixed probability, so repeated calls
        on the same input may return different strings. Runs of whitespace are
        collapsed to single spaces as a side effect of splitting and rejoining.
        """
        # Randomly expand contractions (roughly 10% chance per word). Only
        # exact, case-sensitive whole-word matches are expanded.
        contractions = {"can't": "cannot", "won't": "will not", "I'm": "I am", "it's": "it is"}
        words = text.split()
        text = " ".join(
            contractions.get(word, word) if random.random() > 0.9 else word
            for word in words
        )

        # Add optional comma variations for natural breaks (roughly 30% chance).
        # The look-behind skips an "and" that already follows punctuation, so
        # "a, and b" does not become "a,, and b".
        if random.random() > 0.7:
            text = re.sub(r"(?<![,;:]) and ", ", and ", text)

        # Minor variations in sentence structure (roughly 50% chance).
        if random.random() > 0.5:
            text = text.replace(" is ", " happens to be ")

        return text
def create_interface():
    """Build the Gradio interface for the text enhancer.

    A single ``TextEnhancer`` is created here and shared by every request
    handled through the returned interface.

    Returns:
        The configured ``gr.Interface`` (not yet launched).
    """
    enhancer = TextEnhancer()

    def process_text(text, similarity_threshold):
        """Enhance *text*; the threshold arrives as a percentage from the slider."""
        try:
            result = enhancer.enhance_text(
                text,
                min_similarity=similarity_threshold / 100,
            )
            print("grammar enhanced")
        except Exception as e:
            # UI boundary: show the failure in the output box instead of raising.
            return f"Error: {str(e)}"
        return result

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter text to enhance...",
        lines=10,
    )
    threshold_slider = gr.Slider(
        minimum=50,
        maximum=100,
        value=80,
        label="Minimum Semantic Similarity (%)",
    )
    text_output = gr.Textbox(label="Enhanced Text", lines=10)

    return gr.Interface(
        fn=process_text,
        inputs=[text_input, threshold_slider],
        outputs=text_output,
        title="Text Enhancement System",
        description="Improve text quality while preserving original meaning",
    )
if __name__ == "__main__":
    # Script entry point: build the interface (which constructs the
    # TextEnhancer) and launch it.
    create_interface().launch()