Jaane committed on
Commit
6725d4c
·
verified ·
1 Parent(s): 37bd192

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +199 -0
app.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline
4
+ from sentence_transformers import SentenceTransformer, util
5
+ import openai
6
+ import random
7
+ import re
8
+ import requests
9
+ import warnings
10
+ from transformers import logging
11
+ import os
12
+ import tensorflow as tf
13
# API credentials are not assigned in this section; GROQ_API_KEY has to be
# supplied before the Groq-backed sentence segmentation is called.

# Reduce TensorFlow log output.
# NOTE(review): this variable is assigned after `import tensorflow` has
# already run above; presumably it must be set before that import to affect
# TensorFlow's native logging -- confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
tf.get_logger().setLevel("ERROR")

# Silence Python warnings: first the two specific categories, then every
# warning category (the final `Warning` entry is the catch-all).
for _ignored_category in (FutureWarning, UserWarning, Warning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Only let Hugging Face Transformers report errors.
logging.set_verbosity_error()
25
# Sentence segmentation backed by the Groq chat-completions API.
def segment_into_sentences_groq(passage, timeout=60):
    """Split *passage* into sentences using the Groq chat-completions API.

    The model is asked to append the marker ``1!2@3#`` after every sentence;
    the reply is then split on that marker.

    Args:
        passage: Text to segment.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of non-empty, whitespace-stripped sentences.  An empty or
        whitespace-only passage yields ``[]`` without contacting the API.

    Raises:
        ValueError: If no API key is configured, the API answers with a
            non-200 status, or the response body does not have the expected
            structure.
    """
    # Nothing to segment: avoid a pointless (and billable) remote call.
    if not passage or not passage.strip():
        return []

    # Prefer a module-level GROQ_API_KEY when one has been defined, otherwise
    # fall back to the environment so the key never has to live in the source.
    api_key = globals().get("GROQ_API_KEY") or os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise ValueError(
            "GROQ_API_KEY is not set; define it as an environment variable."
        )

    marker = "1!2@3#"
    instruction = (
        "you are to segment the sentence by adding '1!2@3#' at the end of each "
        "sentence. Return only the segmented sentences only return the modified "
        "passage and nothing else do not add your responses"
    )
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "llama3-8b-8192",
        "messages": [
            {"role": "system", "content": instruction},
            {
                "role": "user",
                "content": f"{instruction}. here is the passage:{passage}",
            },
        ],
        "temperature": 1.0,
        "max_tokens": 8192,
    }

    print("response sent")
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        json=payload,
        headers=headers,
        timeout=timeout,
    )
    print("response recieved")

    if response.status_code != 200:
        raise ValueError(f"Groq API error: {response.text}")

    # Index strictly so a malformed body is reported instead of being
    # silently treated as "no sentences".
    try:
        segmented_text = response.json()["choices"][0]["message"]["content"]
    except (IndexError, KeyError, TypeError, ValueError) as err:
        raise ValueError("Unexpected response structure from Groq API.") from err
    if not isinstance(segmented_text, str):
        raise ValueError("Unexpected response structure from Groq API.")

    print("SOP segmented")
    # Split on the custom marker and drop empty fragments.
    return [part.strip() for part in segmented_text.split(marker) if part.strip()]
61
+
62
+
63
+
64
class TextEnhancer:
    """Rewrite text sentence by sentence while checking semantic similarity.

    Each sentence is paraphrased with a T5 model, the candidates are filtered
    by cosine similarity against the original sentence, and the first
    surviving candidate is grammar-corrected and lightly randomised.
    """

    def __init__(self):
        """Load the paraphrase, grammar-correction and similarity models."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Paraphrase model and its tokenizer.
        paraphrase_checkpoint = "prithivida/parrot_paraphraser_on_T5"
        self.paraphrase_tokenizer = AutoTokenizer.from_pretrained(paraphrase_checkpoint)
        self.paraphrase_model = T5ForConditionalGeneration.from_pretrained(
            paraphrase_checkpoint
        ).to(self.device)
        print("paraphraser loaded")

        # Grammar correction pipeline (device index 0 on CUDA, -1 otherwise).
        self.grammar_pipeline = pipeline(
            "text2text-generation",
            model="Grammarly/coedit-large",
            device=0 if self.device == "cuda" else -1,
        )
        print("grammar check loaded")

        # Sentence-embedding model used for the similarity filter.
        self.similarity_model = SentenceTransformer("paraphrase-MiniLM-L6-v2").to(
            self.device
        )
        print("sementics model loaded")

    def enhance_text(self, text, min_similarity=0.8, max_variations=3):
        """Return an enhanced version of *text*.

        Args:
            text: Passage to enhance.
            min_similarity: Minimum cosine similarity a paraphrase must reach
                against its source sentence to be accepted.
            max_variations: Number of paraphrase candidates (and beams)
                generated per sentence.

        Returns:
            The enhanced passage, or ``""`` when *text* is empty or
            whitespace-only (no model or API call is made in that case).

        Raises:
            ValueError: Propagated from ``segment_into_sentences_groq``.
        """
        if not text or not text.strip():
            return ""

        sentences = segment_into_sentences_groq(text)
        enhanced_sentences = [
            self._enhance_sentence(sentence, min_similarity, max_variations)
            for sentence in sentences
            if sentence.strip()
        ]
        return self._join_sentences(enhanced_sentences)

    def _enhance_sentence(self, sentence, min_similarity, max_variations):
        """Paraphrase one sentence, falling back to the original if needed."""
        inputs = self.paraphrase_tokenizer(
            f"paraphrase: {sentence}",
            return_tensors="pt",
            padding=True,
            max_length=150,
            truncation=True,
        ).to(self.device)

        # NOTE(review): max_length is derived from the word count, not the
        # token count, and temperature is passed without do_sample --
        # presumably it has no effect under beam search; confirm.
        outputs = self.paraphrase_model.generate(
            **inputs,
            max_length=len(sentence.split()) + 20,
            num_return_sequences=max_variations,
            num_beams=max_variations,
            temperature=0.7,
        )
        paraphrases = [
            self.paraphrase_tokenizer.decode(output, skip_special_tokens=True)
            for output in outputs
        ]

        # Keep only candidates that stay close enough to the source sentence.
        sentence_embedding = self.similarity_model.encode(sentence)
        paraphrase_embeddings = self.similarity_model.encode(paraphrases)
        similarities = util.cos_sim(sentence_embedding, paraphrase_embeddings)
        valid_paraphrases = [
            candidate
            for candidate, score in zip(paraphrases, similarities[0])
            if score >= min_similarity
        ]

        if not valid_paraphrases:
            # No acceptable paraphrase: keep the sentence unchanged and trace it.
            print(sentence)
            return sentence

        corrected = self.grammar_pipeline(
            valid_paraphrases[0],
            max_length=150,
            num_return_sequences=1,
        )[0]["generated_text"]
        return self._humanize_text(corrected)

    @staticmethod
    def _join_sentences(sentences):
        """Join sentences with single spaces, keeping their own punctuation.

        A full stop is appended only to sentences that do not already end in
        ``.``, ``!`` or ``?`` (ignoring trailing closing quotes/brackets), so
        questions and exclamations are not turned into ``"?."`` / ``"!."``.
        An empty input yields ``""``.
        """
        closers = "\"')]\u201d\u2019"
        finished = []
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            if not sentence.rstrip(closers).endswith((".", "!", "?")):
                sentence += "."
            finished.append(sentence)
        return " ".join(finished)

    def _humanize_text(self, text):
        """Introduce small random variations so text reads less uniformly.

        Whitespace runs are collapsed to single spaces.  Each step below is
        applied with the probability implied by its ``random.random()`` test.
        """
        # Occasionally expand a contraction (exact whole-token match only).
        contractions = {
            "can't": "cannot",
            "won't": "will not",
            "I'm": "I am",
            "it's": "it is",
        }
        text = " ".join(
            contractions.get(word, word) if random.random() > 0.9 else word
            for word in text.split()
        )

        # Sometimes insert a comma before " and "; the lookbehind skips places
        # that already have one, which previously produced ",, and".
        if random.random() > 0.7:
            text = re.sub(r"(?<!,) and ", ", and ", text)

        # Sometimes replace " is " with a wordier form.
        if random.random() > 0.5:
            text = text.replace(" is ", " happens to be ")

        return text
158
+
159
+
160
def create_interface():
    """Build the Gradio interface around a freshly constructed TextEnhancer.

    Returns:
        A ``gr.Interface`` with a text input, a similarity slider (percent)
        and a text output.
    """
    enhancer = TextEnhancer()

    def process_text(text, similarity_threshold):
        """Run the enhancer; report any failure as an ``Error: ...`` string."""
        # The slider works in percent, the enhancer expects a 0-1 fraction.
        threshold = similarity_threshold / 100
        try:
            result = enhancer.enhance_text(text, min_similarity=threshold)
            print("grammar enhanced")
            return result
        except Exception as exc:
            return f"Error: {exc}"

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter text to enhance...",
        lines=10,
    )
    similarity_slider = gr.Slider(
        minimum=50,
        maximum=100,
        value=80,
        label="Minimum Semantic Similarity (%)",
    )
    output_box = gr.Textbox(label="Enhanced Text", lines=10)

    return gr.Interface(
        fn=process_text,
        inputs=[text_input, similarity_slider],
        outputs=output_box,
        title="Text Enhancement System",
        description="Improve text quality while preserving original meaning",
    )
195
+
196
+
197
if __name__ == "__main__":
    # Build the UI and start the Gradio server when run as a script.
    create_interface().launch()