Spaces:
Sleeping
Sleeping
File size: 7,334 Bytes
6725d4c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 |
import gradio as gr
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline
from sentence_transformers import SentenceTransformer, util
import openai
import random
import re
import requests
import warnings
from transformers import logging
import os
import tensorflow as tf
# Quiet down TensorFlow, Python warnings and Hugging Face Transformers logging.
# (No API key is configured in this section.)
# TF_CPP_MIN_LOG_LEVEL controls TensorFlow's native (C++) log output:
# '0' shows everything, '1' hides INFO, '2' also hides WARNING, '3' also hides ERROR.
# NOTE(review): this runs after `import tensorflow` above, so it may be too late to
# silence messages emitted while TensorFlow is being imported - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Limit TensorFlow's Python-side logger to errors only.
tf.get_logger().setLevel('ERROR')
# Suppress Python warnings
warnings.filterwarnings("ignore", category=FutureWarning)  # Suppress FutureWarnings
warnings.filterwarnings("ignore", category=UserWarning)  # Suppress UserWarnings
# Blanket filter: ignores every warning category, which makes the two
# category-specific filters above redundant.
warnings.filterwarnings("ignore")
# Suppress Hugging Face Transformers warnings.
# `logging` here is `transformers.logging` (imported above), not the stdlib module.
logging.set_verbosity_error()
# Sentence segmentation via a Groq-hosted Llama 3 chat model
def segment_into_sentences_groq(passage):
    """Split *passage* into sentences using a Groq-hosted chat model.

    The model is asked to append the marker ``1!2@3#`` after every sentence;
    the reply is then split on that marker.

    Args:
        passage: Text to segment. ``None``, empty or whitespace-only input
            returns ``[]`` without contacting the API.

    Returns:
        A list of non-empty, whitespace-stripped sentence strings.

    Raises:
        ValueError: If no API key is configured, the HTTP request fails,
            the API answers with a non-200 status, or the response body
            does not have the expected chat-completion structure.
    """
    marker = "1!2@3#"

    # Nothing to segment: skip the network round trip entirely.
    if not passage or not passage.strip():
        return []

    # Prefer a module-level GROQ_API_KEY if one is defined, otherwise fall
    # back to the environment, so a missing key gives a clear error instead
    # of a NameError.
    api_key = globals().get("GROQ_API_KEY") or os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise ValueError(
            "GROQ_API_KEY is not set; provide it as an environment variable."
        )

    instruction = (
        f"you are to segment the sentence by adding '{marker}' at the end of "
        "each sentence. Return only the segmented sentences only return the "
        "modified passage and nothing else do not add your responses"
    )
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "llama3-8b-8192",
        "messages": [
            {"role": "system", "content": instruction},
            {
                "role": "user",
                "content": f"{instruction}. here is the passage:{passage}",
            },
        ],
        # The model should copy the passage verbatim and only insert markers,
        # so sampling randomness is undesirable here.
        "temperature": 0,
        "max_tokens": 8192,
    }

    print("response sent")
    try:
        # A timeout keeps the UI from hanging forever on a stalled connection.
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            json=payload,
            headers=headers,
            timeout=60,
        )
    except requests.RequestException as err:
        # Keep the function's error contract: callers only see ValueError.
        raise ValueError(f"Groq API request failed: {err}") from err
    print("response recieved")

    if response.status_code != 200:
        raise ValueError(f"Groq API error: {response.text}")

    try:
        segmented_text = response.json()["choices"][0]["message"]["content"]
    except (IndexError, KeyError, TypeError, ValueError) as err:
        # ValueError also covers a body that is not valid JSON.
        raise ValueError("Unexpected response structure from Groq API.") from err
    if not isinstance(segmented_text, str):
        raise ValueError("Unexpected response structure from Groq API.")

    print("SOP segmented")
    # Split sentences based on the custom token, dropping empty fragments.
    pieces = (piece.strip() for piece in segmented_text.split(marker))
    return [piece for piece in pieces if piece]
class TextEnhancer:
    """Paraphrase text sentence by sentence while trying to keep its meaning.

    For each sentence the enhancer generates paraphrase candidates with a T5
    paraphraser, keeps the candidates whose embedding cosine similarity to
    the original meets a threshold, grammar-corrects the first surviving
    candidate and finally applies small random "humanizing" edits.
    """

    # A sentence ending in one of these already has terminal punctuation.
    _SENTENCE_ENDINGS = (".", "!", "?")

    # Contractions that _humanize_text may expand (exact, case-sensitive match).
    _CONTRACTIONS = {
        "can't": "cannot",
        "won't": "will not",
        "I'm": "I am",
        "it's": "it is",
    }

    def __init__(self):
        """Load the paraphrase, grammar-correction and similarity models."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Initialize paraphrase model
        self.paraphrase_tokenizer = AutoTokenizer.from_pretrained(
            "prithivida/parrot_paraphraser_on_T5"
        )
        self.paraphrase_model = T5ForConditionalGeneration.from_pretrained(
            "prithivida/parrot_paraphraser_on_T5"
        ).to(self.device)
        print("paraphraser loaded")

        # Initialize grammar correction (pipeline device: 0 = first GPU, -1 = CPU)
        self.grammar_pipeline = pipeline(
            "text2text-generation",
            model="Grammarly/coedit-large",
            device=0 if self.device == "cuda" else -1,
        )
        print("grammar check loaded")

        # Initialize semantic similarity model
        self.similarity_model = SentenceTransformer(
            'paraphrase-MiniLM-L6-v2'
        ).to(self.device)
        print("sementics model loaded")

    def enhance_text(self, text, min_similarity=0.8, max_variations=3):
        """Return an enhanced version of *text*.

        Args:
            text: Passage to enhance; it is segmented into sentences by
                ``segment_into_sentences_groq``.
            min_similarity: Minimum cosine similarity (0-1) a paraphrase must
                have to the original sentence to be accepted.
            max_variations: Number of beams and of paraphrase candidates
                generated per sentence.

        Returns:
            The enhanced sentences joined by single spaces. Sentences lacking
            terminal punctuation get a "." appended; existing "?" / "!" are
            preserved. Returns "" when segmentation yields no sentences.

        Raises:
            ValueError: Propagated from ``segment_into_sentences_groq``.
        """
        sentences = segment_into_sentences_groq(text)
        enhanced_sentences = []
        for sentence in sentences:
            if not sentence.strip():
                continue
            enhanced_sentences.append(
                self._enhance_sentence(sentence, min_similarity, max_variations)
            )
            print(sentence)
        return self._join_sentences(enhanced_sentences)

    def _enhance_sentence(self, sentence, min_similarity, max_variations):
        """Return an accepted, polished paraphrase of *sentence*, or *sentence* itself."""
        inputs = self.paraphrase_tokenizer(
            f"paraphrase: {sentence}",
            return_tensors="pt",
            padding=True,
            max_length=150,
            truncation=True,
        ).to(self.device)

        # Plain beam search. The previous `temperature=0.7` argument was
        # dropped: temperature only affects sampling (do_sample=True), so it
        # had no effect here beyond triggering a configuration warning.
        outputs = self.paraphrase_model.generate(
            **inputs,
            # Word count plus headroom is used as a rough output length cap.
            max_length=len(sentence.split()) + 20,
            num_return_sequences=max_variations,
            num_beams=max_variations,
        )
        paraphrases = [
            self.paraphrase_tokenizer.decode(output, skip_special_tokens=True)
            for output in outputs
        ]

        # Score every candidate against the original sentence.
        sentence_embedding = self.similarity_model.encode(sentence)
        paraphrase_embeddings = self.similarity_model.encode(paraphrases)
        similarities = util.cos_sim(sentence_embedding, paraphrase_embeddings)
        valid_paraphrases = [
            para
            for para, sim in zip(paraphrases, similarities[0])
            if sim >= min_similarity
        ]

        # No candidate was close enough in meaning: keep the original wording.
        if not valid_paraphrases:
            return sentence

        # NOTE(review): the grammar model may expect an instruction prefix
        # (e.g. "Fix grammatical errors in this sentence: ...") - confirm
        # against the model card before changing the input.
        corrected = self.grammar_pipeline(
            valid_paraphrases[0],
            max_length=150,
            num_return_sequences=1,
        )[0]["generated_text"]
        return self._humanize_text(corrected)

    @staticmethod
    def _join_sentences(sentences):
        """Join sentences with single spaces, ensuring each ends with punctuation.

        A "." is appended only to sentences that do not already end in ".",
        "!" or "?", so questions and exclamations are not turned into "?."
        or "!.". Blank entries are skipped; an empty input yields "".
        """
        parts = []
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            if not sentence.endswith(TextEnhancer._SENTENCE_ENDINGS):
                sentence += "."
            parts.append(sentence)
        return " ".join(parts)

    def _humanize_text(self, text):
        """
        Introduce small random variations to make text appear more 'human-like'.

        Each edit is applied with a fixed probability using the global
        ``random`` module, so results differ between calls unless the
        generator is seeded. Splitting and re-joining on whitespace also
        collapses runs of whitespace into single spaces.
        """
        # Expand a known contraction in roughly 10% of word positions.
        words = text.split()
        text = " ".join(
            [
                self._CONTRACTIONS.get(word, word) if random.random() > 0.9 else word
                for word in words
            ]
        )
        # Add optional comma variations for natural breaks (~30% of calls)
        if random.random() > 0.7:
            text = text.replace(" and ", ", and ")
        # Minor variations in sentence structure (~50% of calls)
        if random.random() > 0.5:
            text = text.replace(" is ", " happens to be ")
        return text
def create_interface():
    """Build the Gradio app that exposes a TextEnhancer through a web form.

    A single TextEnhancer is constructed up front and shared by every
    request handled by the returned interface.

    Returns:
        The configured ``gr.Interface`` (not yet launched).
    """
    enhancer = TextEnhancer()

    def process_text(text, similarity_threshold):
        """Run the enhancer; report any failure as an "Error: ..." string."""
        try:
            # The slider value is a percentage; enhance_text takes a 0-1 fraction.
            result = enhancer.enhance_text(
                text, min_similarity=similarity_threshold / 100
            )
            print("grammar enhanced")
            return result
        except Exception as e:
            # UI boundary: show the message in the output box instead of crashing.
            return f"Error: {str(e)}"

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter text to enhance...",
        lines=10,
    )
    similarity_slider = gr.Slider(
        minimum=50,
        maximum=100,
        value=80,
        label="Minimum Semantic Similarity (%)",
    )
    text_output = gr.Textbox(label="Enhanced Text", lines=10)

    return gr.Interface(
        fn=process_text,
        inputs=[text_input, similarity_slider],
        outputs=text_output,
        title="Text Enhancement System",
        description="Improve text quality while preserving original meaning",
    )
if __name__ == "__main__":
    # Script entry point: build the Gradio app and start serving it.
    create_interface().launch()
|